diff options
-rw-r--r-- | src/backend/utils/mb/wchar.c | 60 |
1 files changed, 43 insertions, 17 deletions
diff --git a/src/backend/utils/mb/wchar.c b/src/backend/utils/mb/wchar.c index 6828fce7d3a..6f72a7dbb69 100644 --- a/src/backend/utils/mb/wchar.c +++ b/src/backend/utils/mb/wchar.c @@ -1,7 +1,7 @@ /* * conversion functions between pg_wchar and multibyte streams. * Tatsuo Ishii - * $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.58 2006/10/04 00:30:02 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.59 2007/01/24 17:12:17 tgl Exp $ * * WIN1250 client encoding updated by Pavel Behal * @@ -364,46 +364,60 @@ pg_johab_dsplen(const unsigned char *s) } /* - * convert UTF8 string to pg_wchar (UCS-2) - * caller should allocate enough space for "to" + * convert UTF8 string to pg_wchar (UCS-4) + * caller must allocate enough space for "to", including a trailing zero! * len: length of from. * "from" not necessarily null terminated. */ static int pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len) { - unsigned char c1, - c2, - c3; int cnt = 0; + uint32 c1, + c2, + c3, + c4; while (len > 0 && *from) { - if (!IS_HIGHBIT_SET(*from)) + if ((*from & 0x80) == 0) { *to = *from++; len--; } - else if ((*from & 0xe0) == 0xc0 && len >= 2) + else if ((*from & 0xe0) == 0xc0) { + if (len < 2) + break; /* drop trailing incomplete char */ c1 = *from++ & 0x1f; c2 = *from++ & 0x3f; - *to = c1 << 6; - *to |= c2; + *to = (c1 << 6) | c2; len -= 2; } - else if ((*from & 0xe0) == 0xe0 && len >= 3) + else if ((*from & 0xf0) == 0xe0) { + if (len < 3) + break; /* drop trailing incomplete char */ c1 = *from++ & 0x0f; c2 = *from++ & 0x3f; c3 = *from++ & 0x3f; - *to = c1 << 12; - *to |= c2 << 6; - *to |= c3; + *to = (c1 << 12) | (c2 << 6) | c3; len -= 3; } + else if ((*from & 0xf8) == 0xf0) + { + if (len < 4) + break; /* drop trailing incomplete char */ + c1 = *from++ & 0x07; + c2 = *from++ & 0x3f; + c3 = *from++ & 0x3f; + c4 = *from++ & 0x3f; + *to = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4; + len -= 4; + } else { + /* treat a bogus char as length 1; not ours to raise error */ *to = *from++; len--; } @@ -415,12 +429,20 @@ pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len) } /* - * returns the byte length of a UTF8 character pointed to by s + * Return the byte length of a UTF8 character pointed to by s + * + * Note: in the current implementation we do not support UTF8 sequences + * of more than 4 bytes; hence do NOT return a value larger than 4. + * We return "1" for any leading byte that is either flat-out illegal or + * indicates a length larger than we support. + * + * pg_utf2wchar_with_len(), utf2ucs(), pg_utf8_islegal(), and perhaps + * other places would need to be fixed to change this. */ int pg_utf_mblen(const unsigned char *s) { - int len = 1; + int len; if ((*s & 0x80) == 0) len = 1; @@ -430,10 +452,14 @@ pg_utf_mblen(const unsigned char *s) len = 3; else if ((*s & 0xf8) == 0xf0) len = 4; +#ifdef NOT_USED else if ((*s & 0xfc) == 0xf8) len = 5; else if ((*s & 0xfe) == 0xfc) len = 6; +#endif + else + len = 1; return len; } @@ -596,7 +622,7 @@ utf2ucs(const unsigned char *c) return (pg_wchar) (((c[0] & 0x0f) << 12) | ((c[1] & 0x3f) << 6) | (c[2] & 0x3f)); - else if ((*c & 0xf0) == 0xf0) + else if ((*c & 0xf8) == 0xf0) return (pg_wchar) (((c[0] & 0x07) << 18) | ((c[1] & 0x3f) << 12) | ((c[2] & 0x3f) << 6) | |