diff options
Diffstat (limited to 'src/backend/utils/mb/wchar.c')
-rw-r--r-- | src/backend/utils/mb/wchar.c | 648 |
1 files changed, 554 insertions, 94 deletions
diff --git a/src/backend/utils/mb/wchar.c b/src/backend/utils/mb/wchar.c index 3adf4481e8d..96a26886959 100644 --- a/src/backend/utils/mb/wchar.c +++ b/src/backend/utils/mb/wchar.c @@ -1,7 +1,7 @@ /* * conversion functions between pg_wchar and multibyte streams. * Tatsuo Ishii - * $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.40.4.1 2005/12/24 10:11:32 ishii Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.40.4.2 2006/05/21 20:06:16 tgl Exp $ * * WIN1250 client encoding updated by Pavel Behal * @@ -96,7 +96,7 @@ static int pg_euc2wchar_with_len return (cnt); } -static int +static inline int pg_euc_mblen(const unsigned char *s) { int len; @@ -112,7 +112,7 @@ pg_euc_mblen(const unsigned char *s) return (len); } -static int +static inline int pg_euc_dsplen(const unsigned char *s) { int len; @@ -714,53 +714,433 @@ pg_gb18030_dsplen(const unsigned char *s) return (len); } +/* + *------------------------------------------------------------------- + * multibyte sequence validators + * + * These functions accept "s", a pointer to the first byte of a string, + * and "len", the remaining length of the string. If there is a validly + * encoded character beginning at *s, return its length in bytes; else + * return -1. + * + * The functions can assume that len > 0 and that *s != '\0', but they must + * test for and reject zeroes in any additional bytes of a multibyte character. + * + * Note that this definition allows the function for a single-byte + * encoding to be just "return 1". + *------------------------------------------------------------------- + */ + +static int +pg_ascii_verifier(const unsigned char *s, int len) +{ + return 1; +} + +#define IS_EUC_RANGE_VALID(c) ((c) >= 0xa1 && (c) <= 0xfe) + +static int +pg_eucjp_verifier(const unsigned char *s, int len) +{ + int l; + unsigned char c1, c2; + + c1 = *s++; + + switch (c1) + { + case SS2: /* JIS X 0201 */ + l = 2; + if (l > len) + return -1; + c2 = *s++; + if (c2 < 0xa1 || c2 > 0xdf) + return -1; + break; + + case SS3: /* JIS X 0212 */ + l = 3; + if (l > len) + return -1; + c2 = *s++; + if (!IS_EUC_RANGE_VALID(c2)) + return -1; + c2 = *s++; + if (!IS_EUC_RANGE_VALID(c2)) + return -1; + break; + + default: + if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */ + { + l = 2; + if (l > len) + return -1; + if (!IS_EUC_RANGE_VALID(c1)) + return -1; + c2 = *s++; + if (!IS_EUC_RANGE_VALID(c2)) + return -1; + } + else /* must be ASCII */ + { + l = 1; + } + break; + } + + return l; +} + +static int +pg_euckr_verifier(const unsigned char *s, int len) +{ + int l; + unsigned char c1, c2; + + c1 = *s++; + + if (IS_HIGHBIT_SET(c1)) + { + l = 2; + if (l > len) + return -1; + if (!IS_EUC_RANGE_VALID(c1)) + return -1; + c2 = *s++; + if (!IS_EUC_RANGE_VALID(c2)) + return -1; + } + else /* must be ASCII */ + { + l = 1; + } + + return l; +} + +/* EUC-CN byte sequences are exactly same as EUC-KR */ +#define pg_euccn_verifier pg_euckr_verifier + +static int +pg_euctw_verifier(const unsigned char *s, int len) +{ + int l; + unsigned char c1, c2; + + c1 = *s++; + + switch (c1) + { + case SS2: /* CNS 11643 Plane 1-7 */ + l = 4; + if (l > len) + return -1; + c2 = *s++; + if (c2 < 0xa1 || c2 > 0xa7) + return -1; + c2 = *s++; + if (!IS_EUC_RANGE_VALID(c2)) + return -1; + c2 = *s++; + if (!IS_EUC_RANGE_VALID(c2)) + return -1; + break; + + case SS3: /* unused */ + return -1; + + default: + if (IS_HIGHBIT_SET(c1)) /* CNS 11643 Plane 1 */ + { + l = 2; + if (l > len) + return -1; + /* no further range check on c1? */ + c2 = *s++; + if (!IS_EUC_RANGE_VALID(c2)) + return -1; + } + else /* must be ASCII */ + { + l = 1; + } + break; + } + return l; +} + +static int +pg_johab_verifier(const unsigned char *s, int len) +{ + int l, mbl; + unsigned char c; + + l = mbl = pg_johab_mblen(s); + + if (len < l) + return -1; + + if (!IS_HIGHBIT_SET(*s)) + return mbl; + + while (--l > 0) + { + c = *++s; + if (!IS_EUC_RANGE_VALID(c)) + return -1; + } + return mbl; +} + +static int +pg_mule_verifier(const unsigned char *s, int len) +{ + int l, mbl; + unsigned char c; + + l = mbl = pg_mule_mblen(s); + + if (len < l) + return -1; + + while (--l > 0) + { + c = *++s; + if (!IS_HIGHBIT_SET(c)) + return -1; + } + return mbl; +} + +static int +pg_latin1_verifier(const unsigned char *s, int len) +{ + return 1; +} + +static int +pg_sjis_verifier(const unsigned char *s, int len) +{ + int l, mbl; + unsigned char c1, c2; + + l = mbl = pg_sjis_mblen(s); + + if (len < l) + return -1; + + if (l == 1) /* pg_sjis_mblen already verified it */ + return mbl; + + c1 = *s++; + c2 = *s; + if (!ISSJISHEAD(c1) || !ISSJISTAIL(c2)) + return -1; + return mbl; +} + +static int +pg_big5_verifier(const unsigned char *s, int len) +{ + int l, mbl; + + l = mbl = pg_big5_mblen(s); + + if (len < l) + return -1; + + while (--l > 0) + { + if (*++s == '\0') + return -1; + } + + return mbl; +} + +static int +pg_gbk_verifier(const unsigned char *s, int len) +{ + int l, mbl; + + l = mbl = pg_gbk_mblen(s); + + if (len < l) + return -1; + + while (--l > 0) + { + if (*++s == '\0') + return -1; + } + + return mbl; +} + +static int +pg_uhc_verifier(const unsigned char *s, int len) +{ + int l, mbl; + + l = mbl = pg_uhc_mblen(s); + + if (len < l) + return -1; + + while (--l > 0) + { + if (*++s == '\0') + return -1; + } + + return mbl; +} + +static int +pg_gb18030_verifier(const unsigned char *s, int len) +{ + int l, mbl; + + l = mbl = pg_gb18030_mblen(s); + + if (len < l) + return -1; + + while (--l > 0) + { + if (*++s == '\0') + return -1; + } + + return mbl; +} + +static int +pg_utf8_verifier(const unsigned char *s, int len) +{ + int l = pg_utf_mblen(s); + + if (len < l) + return -1; + + if (!pg_utf8_islegal(s, l)) + return -1; + return l; +} + +/* + * Check for validity of a single UTF-8 encoded character + * + * This directly implements the rules in RFC3629, modified to restrict + * us to 16-bit Unicode code points (hence, at most 3 bytes in UTF8). + * The bizarre-looking + * restrictions on the second byte are meant to ensure that there isn't + * more than one encoding of a given Unicode character point; that is, + * you may not use a longer-than-necessary byte sequence with high order + * zero bits to represent a character that would fit in fewer bytes. + * To do otherwise is to create security hazards (eg, create an apparent + * non-ASCII character that decodes to plain ASCII). + * + * length is assumed to have been obtained by pg_utf_mblen(), and the + * caller must have checked that that many bytes are present in the buffer. + */ +bool +pg_utf8_islegal(const unsigned char *source, int length) +{ + unsigned char a; + + switch (length) + { + default: + /* reject lengths 4, 5 and 6 for now */ + return false; + case 3: + a = source[2]; + if (a < 0x80 || a > 0xBF) + return false; + /* FALL THRU */ + case 2: + a = source[1]; + switch (*source) + { + case 0xE0: + if (a < 0xA0 || a > 0xBF) + return false; + break; + case 0xED: + if (a < 0x80 || a > 0x9F) + return false; + break; + default: + if (a < 0x80 || a > 0xBF) + return false; + break; + } + /* FALL THRU */ + case 1: + a = *source; + if (a >= 0x80 && a < 0xC2) + return false; + if (a > 0xEF) + return false; + break; + } + return true; +} + +/* + *------------------------------------------------------------------- + * encoding info table + *------------------------------------------------------------------- + */ pg_wchar_tbl pg_wchar_table[] = { - {pg_ascii2wchar_with_len, pg_ascii_mblen, pg_ascii_dsplen, 1}, /* 0; PG_SQL_ASCII */ - {pg_eucjp2wchar_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, 3}, /* 1; PG_EUC_JP */ - {pg_euccn2wchar_with_len, pg_euccn_mblen, pg_euccn_dsplen, 3}, /* 2; PG_EUC_CN */ - {pg_euckr2wchar_with_len, pg_euckr_mblen, pg_euckr_dsplen, 3}, /* 3; PG_EUC_KR */ - {pg_euctw2wchar_with_len, pg_euctw_mblen, pg_euctw_dsplen, 3}, /* 4; PG_EUC_TW */ - {pg_johab2wchar_with_len, pg_johab_mblen, pg_johab_dsplen, 3}, /* 5; PG_JOHAB */ - {pg_utf2wchar_with_len, pg_utf_mblen, pg_utf_dsplen, 3}, /* 6; PG_UNICODE */ - {pg_mule2wchar_with_len, pg_mule_mblen, pg_mule_dsplen, 3}, /* 7; PG_MULE_INTERNAL */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 8; PG_LATIN1 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 9; PG_LATIN2 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 10; PG_LATIN3 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 11; PG_LATIN4 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 12; PG_LATIN5 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 13; PG_LATIN6 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 14; PG_LATIN7 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 15; PG_LATIN8 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 16; PG_LATIN9 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 17; PG_LATIN10 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 18; PG_WIN1256 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 19; PG_TCVN */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 20; PG_WIN874 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 21; PG_KOI8 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 22; PG_WIN1251 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 23; PG_ALT */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 24; ISO-8859-5 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 25; ISO-8859-6 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 26; ISO-8859-7 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 27; ISO-8859-8 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 28; PG_WIN1250 */ - {0, pg_sjis_mblen, pg_sjis_dsplen, 2}, /* 29; PG_SJIS */ - {0, pg_big5_mblen, pg_big5_dsplen, 2}, /* 30; PG_BIG5 */ - {0, pg_gbk_mblen, pg_gbk_dsplen, 2}, /* 31; PG_GBK */ - {0, pg_uhc_mblen, pg_uhc_dsplen, 2}, /* 32; PG_UHC */ - {0, pg_gb18030_mblen, pg_gb18030_dsplen, 2} /* 33; PG_GB18030 */ + {pg_ascii2wchar_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifier, 1}, /* 0; PG_SQL_ASCII */ + {pg_eucjp2wchar_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifier, 3}, /* 1; PG_EUC_JP */ + {pg_euccn2wchar_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifier, 3}, /* 2; PG_EUC_CN */ + {pg_euckr2wchar_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifier, 3}, /* 3; PG_EUC_KR */ + {pg_euctw2wchar_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifier, 3}, /* 4; PG_EUC_TW */ + {pg_johab2wchar_with_len, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifier, 3}, /* 5; PG_JOHAB */ + {pg_utf2wchar_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_verifier, 3}, /* 6; PG_UNICODE */ + {pg_mule2wchar_with_len, pg_mule_mblen, pg_mule_dsplen, pg_mule_verifier, 3}, /* 7; PG_MULE_INTERNAL */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 8; PG_LATIN1 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 9; PG_LATIN2 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 10; PG_LATIN3 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 11; PG_LATIN4 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 12; PG_LATIN5 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 13; PG_LATIN6 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 14; PG_LATIN7 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 15; PG_LATIN8 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 16; PG_LATIN9 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 17; PG_LATIN10 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 18; PG_WIN1256 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 19; PG_TCVN */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 20; PG_WIN874 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 21; PG_KOI8 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 22; PG_WIN1251 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 23; PG_ALT */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 24; ISO-8859-5 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 25; ISO-8859-6 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 26; ISO-8859-7 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 27; ISO-8859-8 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 28; PG_WIN1250 */ + {0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2}, /* 29; PG_SJIS */ + {0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifier, 2}, /* 30; PG_BIG5 */ + {0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifier, 2}, /* 31; PG_GBK */ + {0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifier, 2}, /* 32; PG_UHC */ + {0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifier, 2} /* 33; PG_GB18030 */ }; /* returns the byte length of a word for mule internal code */ int pg_mic_mblen(const unsigned char *mbstr) { - return (pg_mule_mblen(mbstr)); + return pg_mule_mblen(mbstr); } /* - * Returns the byte length of a multibyte word. + * Returns the byte length of a multibyte character. */ int pg_encoding_mblen(int encoding, const unsigned char *mbstr) @@ -769,12 +1149,12 @@ pg_encoding_mblen(int encoding, const unsigned char *mbstr) return ((encoding >= 0 && encoding < sizeof(pg_wchar_table) / sizeof(pg_wchar_tbl)) ? - ((*pg_wchar_table[encoding].mblen) (mbstr)) : - ((*pg_wchar_table[PG_SQL_ASCII].mblen) (mbstr))); + ((*pg_wchar_table[encoding].mblen) ((const unsigned char *) mbstr)) : + ((*pg_wchar_table[PG_SQL_ASCII].mblen) ((const unsigned char *) mbstr))); } /* - * Returns the display length of a multibyte word. + * Returns the display length of a multibyte character. */ int pg_encoding_dsplen(int encoding, const unsigned char *mbstr) @@ -783,12 +1163,28 @@ pg_encoding_dsplen(int encoding, const unsigned char *mbstr) return ((encoding >= 0 && encoding < sizeof(pg_wchar_table) / sizeof(pg_wchar_tbl)) ? - ((*pg_wchar_table[encoding].dsplen) (mbstr)) : - ((*pg_wchar_table[PG_SQL_ASCII].dsplen) (mbstr))); + ((*pg_wchar_table[encoding].dsplen) ((const unsigned char *) mbstr)) : + ((*pg_wchar_table[PG_SQL_ASCII].dsplen) ((const unsigned char *) mbstr))); +} + +/* + * Verify the first multibyte character of the given string. + * Return its byte length if good, -1 if bad. (See comments above for + * full details of the mbverify API.) + */ +int +pg_encoding_verifymb(int encoding, const char *mbstr, int len) +{ + Assert(PG_VALID_ENCODING(encoding)); + + return ((encoding >= 0 && + encoding < sizeof(pg_wchar_table) / sizeof(pg_wchar_tbl)) ? + ((*pg_wchar_table[encoding].mbverify) ((const unsigned char *) mbstr, len)) : + ((*pg_wchar_table[PG_SQL_ASCII].mbverify) ((const unsigned char *) mbstr, len))); } /* - * fetch maximum length of a char encoding + * fetch maximum length of a given encoding */ int pg_encoding_max_length(int encoding) @@ -801,83 +1197,147 @@ pg_encoding_max_length(int encoding) #ifndef FRONTEND /* - * Verify mbstr to make sure that it has a valid character sequence. - * mbstr is not necessarily NULL terminated; length of mbstr is + * fetch maximum length of the encoding for the current database + */ +int +pg_database_encoding_max_length(void) +{ + return pg_wchar_table[GetDatabaseEncoding()].maxmblen; +} + +/* + * Verify mbstr to make sure that it is validly encoded in the current + * database encoding. Otherwise same as pg_verify_mbstr(). + */ +bool +pg_verifymbstr(const char *mbstr, int len, bool noError) +{ + return pg_verify_mbstr(GetDatabaseEncoding(), mbstr, len, noError); +} + +/* + * Verify mbstr to make sure that it is validly encoded in the specified + * encoding. + * + * mbstr is not necessarily zero terminated; length of mbstr is * specified by len. * * If OK, return TRUE. If a problem is found, return FALSE when noError is * true; when noError is false, ereport() a descriptive message. */ bool -pg_verifymbstr(const unsigned char *mbstr, int len, bool noError) +pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError) { - int l; - int i; - int encoding; + mbverifier mbverify; + + Assert(PG_VALID_ENCODING(encoding)); - /* we do not need any check in single-byte encodings */ - if (pg_database_encoding_max_length() <= 1) - return true; + /* + * In single-byte encodings, we need only reject nulls (\0). + */ + if (pg_encoding_max_length(encoding) <= 1) + { + const char *nullpos = memchr(mbstr, 0, len); - encoding = GetDatabaseEncoding(); + if (nullpos == NULL) + return true; + if (noError) + return false; + report_invalid_encoding(encoding, nullpos, 1); + } - while (len > 0 && *mbstr) + /* fetch function pointer just once */ + mbverify = pg_wchar_table[encoding].mbverify; + + while (len > 0) { - /* special UTF-8 check */ - if (encoding == PG_UTF8 && (*mbstr & 0xf8) == 0xf0) + int l; + + /* fast path for ASCII-subset characters */ + if (!IS_HIGHBIT_SET(*mbstr)) { + if (*mbstr != '\0') + { + mbstr++; + len--; + continue; + } if (noError) return false; - ereport(ERROR, - (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), - errmsg("Unicode characters greater than or equal to 0x10000 are not supported"))); + report_invalid_encoding(encoding, mbstr, len); } - l = pg_mblen(mbstr); + l = (*mbverify) ((const unsigned char *) mbstr, len); - for (i = 1; i < l; i++) + if (l < 0) { - /* - * we expect that every multibyte char consists of bytes - * having the 8th bit set - */ - if (i >= len || (mbstr[i] & 0x80) == 0) - { - char buf[8 * 2 + 1]; - char *p = buf; - int j, - jlimit; - - if (noError) - return false; - - jlimit = Min(l, len); - jlimit = Min(jlimit, 8); /* prevent buffer overrun */ - - for (j = 0; j < jlimit; j++) - p += sprintf(p, "%02x", mbstr[j]); - - ereport(ERROR, - (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), - errmsg("invalid byte sequence for encoding \"%s\": 0x%s", - GetDatabaseEncodingName(), buf))); - } + if (noError) + return false; + report_invalid_encoding(encoding, mbstr, len); } - len -= l; mbstr += l; + len -= l; } - return true; } /* - * fetch maximum length of a char encoding for the current database + * report_invalid_encoding: complain about invalid multibyte character + * + * note: len is remaining length of string, not length of character; + * len must be greater than zero, as we always examine the first byte. */ -int -pg_database_encoding_max_length(void) +void +report_invalid_encoding(int encoding, const char *mbstr, int len) { - return pg_wchar_table[GetDatabaseEncoding()].maxmblen; + int l = pg_encoding_mblen(encoding, mbstr); + char buf[8 * 2 + 1]; + char *p = buf; + int j, + jlimit; + + jlimit = Min(l, len); + jlimit = Min(jlimit, 8); /* prevent buffer overrun */ + + for (j = 0; j < jlimit; j++) + p += sprintf(p, "%02x", (unsigned char) mbstr[j]); + + ereport(ERROR, + (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), + errmsg("invalid byte sequence for encoding \"%s\": 0x%s", + pg_enc2name_tbl[encoding].name, + buf))); +} + +/* + * report_untranslatable_char: complain about untranslatable character + * + * note: len is remaining length of string, not length of character; + * len must be greater than zero, as we always examine the first byte. + */ +void +report_untranslatable_char(int src_encoding, int dest_encoding, + const char *mbstr, int len) +{ + int l = pg_encoding_mblen(src_encoding, mbstr); + char buf[8 * 2 + 1]; + char *p = buf; + int j, + jlimit; + + jlimit = Min(l, len); + jlimit = Min(jlimit, 8); /* prevent buffer overrun */ + + for (j = 0; j < jlimit; j++) + p += sprintf(p, "%02x", (unsigned char) mbstr[j]); + + ereport(ERROR, + (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER), + errmsg("character 0x%s of encoding \"%s\" has no equivalent in \"%s\"", + buf, + pg_enc2name_tbl[src_encoding].name, + pg_enc2name_tbl[dest_encoding].name))); } #endif |