diff options
31 files changed, 1527 insertions, 932 deletions
diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index 2b49094bea0..6678522dc73 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/commands/copy.c,v 1.263 2006/04/05 22:11:54 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/commands/copy.c,v 1.264 2006/05/21 20:05:19 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -1023,9 +1023,15 @@ DoCopy(const CopyStmt *stmt) cstate->raw_buf_index = cstate->raw_buf_len = 0; cstate->processed = 0; - /* Set up encoding conversion info */ + /* + * Set up encoding conversion info. Even if the client and server + * encodings are the same, we must apply pg_client_to_server() to + * validate data in multibyte encodings. + */ cstate->client_encoding = pg_get_client_encoding(); - cstate->need_transcoding = (cstate->client_encoding != GetDatabaseEncoding()); + cstate->need_transcoding = + (cstate->client_encoding != GetDatabaseEncoding() || + pg_database_encoding_max_length() > 1); /* See Multibyte encoding comment above */ cstate->encoding_embeds_ascii = PG_ENCODING_IS_CLIENT_ONLY(cstate->client_encoding); diff --git a/src/backend/utils/adt/name.c b/src/backend/utils/adt/name.c index ed6f891fe27..aa53fb05319 100644 --- a/src/backend/utils/adt/name.c +++ b/src/backend/utils/adt/name.c @@ -14,7 +14,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/adt/name.c,v 1.57 2006/03/05 15:58:43 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/utils/adt/name.c,v 1.58 2006/05/21 20:05:19 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -49,10 +49,7 @@ namein(PG_FUNCTION_ARGS) NameData *result; int len; - /* verify encoding */ len = strlen(s); - pg_verifymbstr(s, len, false); - len = pg_mbcliplen(s, len, NAMEDATALEN - 1); result = (NameData *) palloc0(NAMEDATALEN); diff --git a/src/backend/utils/adt/varchar.c b/src/backend/utils/adt/varchar.c index f72b3726488..c6ea11d381e 100644 --- a/src/backend/utils/adt/varchar.c +++ b/src/backend/utils/adt/varchar.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/adt/varchar.c,v 1.115 2006/03/05 15:58:44 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/utils/adt/varchar.c,v 1.116 2006/05/21 20:05:19 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -73,9 +73,6 @@ bpchar_input(const char *s, size_t len, int32 atttypmod) char *r; size_t maxlen; - /* verify encoding */ - pg_verifymbstr(s, len, false); - /* If typmod is -1 (or invalid), use the actual string length */ if (atttypmod < (int32) VARHDRSZ) maxlen = len; @@ -393,9 +390,6 @@ varchar_input(const char *s, size_t len, int32 atttypmod) VarChar *result; size_t maxlen; - /* verify encoding */ - pg_verifymbstr(s, len, false); - maxlen = atttypmod - VARHDRSZ; if (atttypmod >= (int32) VARHDRSZ && len > maxlen) diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index 79dc0178a80..7bc5a09f693 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/adt/varlena.c,v 1.146 2006/04/04 19:35:36 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/utils/adt/varlena.c,v 1.147 2006/05/21 20:05:19 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -256,10 +256,7 @@ textin(PG_FUNCTION_ARGS) text *result; int len; - /* verify encoding */ len = strlen(inputText); - pg_verifymbstr(inputText, len, false); - result = (text *) palloc(len + VARHDRSZ); VARATT_SIZEP(result) = len + VARHDRSZ; @@ -299,9 +296,6 @@ textrecv(PG_FUNCTION_ARGS) str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes); - /* verify encoding */ - pg_verifymbstr(str, nbytes, false); - result = (text *) palloc(nbytes + VARHDRSZ); VARATT_SIZEP(result) = nbytes + VARHDRSZ; memcpy(VARDATA(result), str, nbytes); diff --git a/src/backend/utils/mb/conv.c b/src/backend/utils/mb/conv.c index a544f107264..deaf912ed00 100644 --- a/src/backend/utils/mb/conv.c +++ b/src/backend/utils/mb/conv.c @@ -6,170 +6,81 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/mb/conv.c,v 1.59 2006/03/05 15:58:46 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/conv.c,v 1.60 2006/05/21 20:05:19 tgl Exp $ * *------------------------------------------------------------------------- */ #include "postgres.h" #include "mb/pg_wchar.h" -/* - * convert bogus chars that cannot be represented in the current - * encoding system. - */ -void -pg_print_bogus_char(unsigned char **mic, unsigned char **p) -{ - char strbuf[16]; - int l = pg_mic_mblen(*mic); - - *(*p)++ = '('; - while (l--) - { - sprintf(strbuf, "%02x", *(*mic)++); - *(*p)++ = strbuf[0]; - *(*p)++ = strbuf[1]; - } - *(*p)++ = ')'; -} - -#ifdef NOT_USED /* - * GB18030 ---> MIC - * Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp> - */ -static void -gb180302mic(unsigned char *gb18030, unsigned char *p, int len) -{ - int c1; - int c2; - - while (len > 0 && (c1 = *gb18030++)) - { - if (c1 < 0x80) - { /* should be ASCII */ - len--; - *p++ = c1; - } - else if (c1 >= 0x81 && c1 <= 0xfe) - { - c2 = *gb18030++; - - if (c2 >= 0x30 && c2 <= 0x69) - { - len -= 4; - *p++ = c1; - *p++ = c2; - *p++ = *gb18030++; - *p++ = *gb18030++; - *p++ = *gb18030++; - } - else if ((c2 >= 0x40 && c2 <= 0x7e) || (c2 >= 0x80 && c2 <= 0xfe)) - { - len -= 2; - *p++ = c1; - *p++ = c2; - *p++ = *gb18030++; - } - else - { /* throw the strange code */ - len--; - } - } - } - *p = '\0'; -} - -/* - * MIC ---> GB18030 - * Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp> - */ -static void -mic2gb18030(unsigned char *mic, unsigned char *p, int len) -{ - int c1; - int c2; - - while (len > 0 && (c1 = *mic)) - { - len -= pg_mic_mblen(mic++); - - if (!IS_HIGHBIT_SET(c1)) /* ASCII */ - *p++ = c1; - else if (c1 >= 0x81 && c1 <= 0xfe) - { - c2 = *mic++; - - if ((c2 >= 0x40 && c2 <= 0x7e) || (c2 >= 0x80 && c2 <= 0xfe)) - { - *p++ = c1; - *p++ = c2; - } - else if (c2 >= 0x30 && c2 <= 0x39) - { - *p++ = c1; - *p++ = c2; - *p++ = *mic++; - *p++ = *mic++; - } - else - { - mic--; - pg_print_bogus_char(&mic, &p); - mic--; - pg_print_bogus_char(&mic, &p); - } - } - else - { - mic--; - pg_print_bogus_char(&mic, &p); - } - } - *p = '\0'; -} -#endif - -/* - * LATINn ---> MIC + * LATINn ---> MIC when the charset's local codes map directly to MIC + * + * l points to the source string of length len + * p is the output area (must be large enough!) + * lc is the mule character set id for the local encoding + * encoding is the PG identifier for the local encoding */ void -latin2mic(unsigned char *l, unsigned char *p, int len, int lc) +latin2mic(const unsigned char *l, unsigned char *p, int len, + int lc, int encoding) { int c1; - while (len-- > 0 && (c1 = *l++)) + while (len > 0) { + c1 = *l; + if (c1 == 0) + report_invalid_encoding(encoding, (const char *) l, len); if (IS_HIGHBIT_SET(c1)) - *p++ = lc; /* Latin? */ + *p++ = lc; *p++ = c1; + l++; + len--; } *p = '\0'; } /* - * MIC ---> LATINn + * MIC ---> LATINn when the charset's local codes map directly to MIC + * + * mic points to the source string of length len + * p is the output area (must be large enough!) + * lc is the mule character set id for the local encoding + * encoding is the PG identifier for the local encoding */ void -mic2latin(unsigned char *mic, unsigned char *p, int len, int lc) +mic2latin(const unsigned char *mic, unsigned char *p, int len, + int lc, int encoding) { int c1; - while (len > 0 && (c1 = *mic)) + while (len > 0) { - len -= pg_mic_mblen(mic++); - - if (c1 == lc) - *p++ = *mic++; - else if (IS_HIGHBIT_SET(c1)) + c1 = *mic; + if (c1 == 0) + report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len); + if (!IS_HIGHBIT_SET(c1)) { - mic--; - pg_print_bogus_char(&mic, &p); + /* easy for ASCII */ + *p++ = c1; + mic++; + len--; } else - { /* should be ASCII */ - *p++ = c1; + { + int l = pg_mic_mblen(mic); + + if (len < l) + report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, + len); + if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1])) + report_untranslatable_char(PG_MULE_INTERNAL, encoding, + (const char *) mic, len); + *p++ = mic[1]; + mic += 2; + len -= 2; } } *p = '\0'; @@ -178,14 +89,25 @@ mic2latin(unsigned char *mic, unsigned char *p, int len, int lc) /* * ASCII ---> MIC + * + * While ordinarily SQL_ASCII encoding is forgiving of high-bit-set + * characters, here we must take a hard line because we don't know + * the appropriate MIC equivalent. */ void -pg_ascii2mic(unsigned char *l, unsigned char *p, int len) +pg_ascii2mic(const unsigned char *l, unsigned char *p, int len) { int c1; - while (len-- > 0 && (c1 = *l++)) - *p++ = (c1 & 0x7f); + while (len > 0) + { + c1 = *l; + if (c1 == 0 || IS_HIGHBIT_SET(c1)) + report_invalid_encoding(PG_SQL_ASCII, (const char *) l, len); + *p++ = c1; + l++; + len--; + } *p = '\0'; } @@ -193,19 +115,19 @@ pg_ascii2mic(unsigned char *l, unsigned char *p, int len) * MIC ---> ASCII */ void -pg_mic2ascii(unsigned char *mic, unsigned char *p, int len) +pg_mic2ascii(const unsigned char *mic, unsigned char *p, int len) { int c1; - while (len-- > 0 && (c1 = *mic)) + while (len > 0) { - if (IS_HIGHBIT_SET(c1)) - pg_print_bogus_char(&mic, &p); - else - { /* should be ASCII */ - *p++ = c1; - mic++; - } + c1 = *mic; + if (c1 == 0 || IS_HIGHBIT_SET(c1)) + report_untranslatable_char(PG_MULE_INTERNAL, PG_SQL_ASCII, + (const char *) mic, len); + *p++ = c1; + mic++; + len--; } *p = '\0'; } @@ -213,86 +135,103 @@ pg_mic2ascii(unsigned char *mic, unsigned char *p, int len) /* * latin2mic_with_table: a generic single byte charset encoding * conversion from a local charset to the mule internal code. - * with a encoding conversion table. - * the table is ordered according to the local charset, + * + * l points to the source string of length len + * p is the output area (must be large enough!) + * lc is the mule character set id for the local encoding + * encoding is the PG identifier for the local encoding + * tab holds conversion entries for the local charset * starting from 128 (0x80). each entry in the table * holds the corresponding code point for the mule internal code. */ void -latin2mic_with_table( - unsigned char *l, /* local charset string (source) */ - unsigned char *p, /* pointer to store mule internal code - * (destination) */ - int len, /* length of l */ - int lc, /* leading character of p */ - unsigned char *tab /* code conversion table */ -) +latin2mic_with_table(const unsigned char *l, + unsigned char *p, + int len, + int lc, + int encoding, + const unsigned char *tab) { unsigned char c1, c2; - while (len-- > 0 && (c1 = *l++)) + while (len > 0) { - if (c1 < 128) + c1 = *l; + if (c1 == 0) + report_invalid_encoding(encoding, (const char *) l, len); + if (!IS_HIGHBIT_SET(c1)) *p++ = c1; else { - c2 = tab[c1 - 128]; + c2 = tab[c1 - HIGHBIT]; if (c2) { *p++ = lc; *p++ = c2; } else - { - *p++ = ' '; /* cannot convert */ - } + report_untranslatable_char(encoding, PG_MULE_INTERNAL, + (const char *) l, len); } + l++; + len--; } *p = '\0'; } /* * mic2latin_with_table: a generic single byte charset encoding - * conversion from the mule internal code to a local charset - * with a encoding conversion table. - * the table is ordered according to the second byte of the mule - * internal code starting from 128 (0x80). - * each entry in the table - * holds the corresponding code point for the local code. + * conversion from the mule internal code to a local charset. + * + * mic points to the source string of length len + * p is the output area (must be large enough!) + * lc is the mule character set id for the local encoding + * encoding is the PG identifier for the local encoding + * tab holds conversion entries for the mule internal code's + * second byte, starting from 128 (0x80). each entry in the table + * holds the corresponding code point for the local charset. */ void -mic2latin_with_table( - unsigned char *mic, /* mule internal code (source) */ - unsigned char *p, /* local code (destination) */ - int len, /* length of p */ - int lc, /* leading character */ - unsigned char *tab /* code conversion table */ -) +mic2latin_with_table(const unsigned char *mic, + unsigned char *p, + int len, + int lc, + int encoding, + const unsigned char *tab) { - unsigned char c1, c2; - while (len-- > 0 && (c1 = *mic++)) + while (len > 0) { - if (c1 < 128) - *p++ = c1; - else if (c1 == lc) + c1 = *mic; + if (c1 == 0) + report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len); + if (!IS_HIGHBIT_SET(c1)) { - c1 = *mic++; + /* easy for ASCII */ + *p++ = c1; + mic++; len--; - c2 = tab[c1 - 128]; - if (c2) - *p++ = c2; - else - { - *p++ = ' '; /* cannot convert */ - } } else { - *p++ = ' '; /* bogus character */ + int l = pg_mic_mblen(mic); + + if (len < l) + report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, + len); + if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]) || + (c2 = tab[mic[1] - HIGHBIT]) == 0) + { + report_untranslatable_char(PG_MULE_INTERNAL, encoding, + (const char *) mic, len); + break; /* keep compiler quiet */ + } + *p++ = c2; + mic += 2; + len -= 2; } } *p = '\0'; @@ -331,25 +270,38 @@ compare2(const void *p1, const void *p2) /* * UTF8 ---> local code * - * utf: input UTF8 string. Its length is limited by "len" parameter - * or a null terminator. - * iso: pointer to the output. + * utf: input UTF8 string (need not be null-terminated). + * iso: pointer to the output area (must be large enough!) * map: the conversion map. * size: the size of the conversion map. + * encoding: the PG identifier for the local encoding. + * len: length of input string. */ void -UtfToLocal(unsigned char *utf, unsigned char *iso, - pg_utf_to_local *map, int size, int len) +UtfToLocal(const unsigned char *utf, unsigned char *iso, + const pg_utf_to_local *map, int size, int encoding, int len) { unsigned int iutf; int l; pg_utf_to_local *p; - for (; len > 0 && *utf; len -= l) + for (; len > 0; len -= l) { + /* "break" cases all represent errors */ + if (*utf == '\0') + break; + l = pg_utf_mblen(utf); + + if (len < l) + break; + + if (!pg_utf8_islegal(utf, l)) + break; + if (l == 1) { + /* ASCII case is easy */ *iso++ = *utf++; continue; } @@ -371,16 +323,14 @@ UtfToLocal(unsigned char *utf, unsigned char *iso, iutf |= *utf++ << 8; iutf |= *utf++; } + p = bsearch(&iutf, map, size, sizeof(pg_utf_to_local), compare1); + if (p == NULL) - { - ereport(WARNING, - (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER), - errmsg("ignoring unconvertible UTF-8 character 0x%04x", - iutf))); - continue; - } + report_untranslatable_char(PG_UTF8, encoding, + (const char *) (utf - l), len); + if (p->code & 0xff000000) *iso++ = p->code >> 24; if (p->code & 0x00ff0000) @@ -390,15 +340,26 @@ UtfToLocal(unsigned char *utf, unsigned char *iso, if (p->code & 0x000000ff) *iso++ = p->code & 0x000000ff; } + + if (len > 0) + report_invalid_encoding(PG_UTF8, (const char *) utf, len); + *iso = '\0'; } /* * local code ---> UTF8 + * + * iso: input local string (need not be null-terminated). + * utf: pointer to the output area (must be large enough!) + * map: the conversion map. + * size: the size of the conversion map. + * encoding: the PG identifier for the local encoding. + * len: length of input string. */ void -LocalToUtf(unsigned char *iso, unsigned char *utf, - pg_local_to_utf *map, int size, int encoding, int len) +LocalToUtf(const unsigned char *iso, unsigned char *utf, + const pg_local_to_utf *map, int size, int encoding, int len) { unsigned int iiso; int l; @@ -409,16 +370,23 @@ LocalToUtf(unsigned char *iso, unsigned char *utf, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("invalid encoding number: %d", encoding))); - for (; len > 0 && *iso; len -= l) + for (; len > 0; len -= l) { + /* "break" cases all represent errors */ + if (*iso == '\0') + break; + if (!IS_HIGHBIT_SET(*iso)) { + /* ASCII case is easy */ *utf++ = *iso++; l = 1; continue; } - l = pg_encoding_mblen(encoding, (char *) iso); + l = pg_encoding_verifymb(encoding, (const char *) iso, len); + if (l < 0) + break; if (l == 1) iiso = *iso++; @@ -440,16 +408,13 @@ LocalToUtf(unsigned char *iso, unsigned char *utf, iiso |= *iso++ << 8; iiso |= *iso++; } + p = bsearch(&iiso, map, size, sizeof(pg_local_to_utf), compare2); if (p == NULL) - { - ereport(WARNING, - (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER), - errmsg("ignoring unconvertible %s character 0x%04x", - (&pg_enc2name_tbl[encoding])->name, iiso))); - continue; - } + report_untranslatable_char(encoding, PG_UTF8, + (const char *) (iso - l), len); + if (p->utf & 0xff000000) *utf++ = p->utf >> 24; if (p->utf & 0x00ff0000) @@ -459,5 +424,9 @@ LocalToUtf(unsigned char *iso, unsigned char *utf, if (p->utf & 0x000000ff) *utf++ = p->utf & 0x000000ff; } + + if (len > 0) + report_invalid_encoding(encoding, (const char *) iso, len); + *utf = '\0'; } diff --git a/src/backend/utils/mb/conversion_procs/cyrillic_and_mic/cyrillic_and_mic.c b/src/backend/utils/mb/conversion_procs/cyrillic_and_mic/cyrillic_and_mic.c index 544530d5610..57fc2d7a6fa 100644 --- a/src/backend/utils/mb/conversion_procs/cyrillic_and_mic/cyrillic_and_mic.c +++ b/src/backend/utils/mb/conversion_procs/cyrillic_and_mic/cyrillic_and_mic.c @@ -6,7 +6,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/cyrillic_and_mic/cyrillic_and_mic.c,v 1.12 2006/03/05 15:58:47 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/cyrillic_and_mic/cyrillic_and_mic.c,v 1.13 2006/05/21 20:05:19 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -70,14 +70,14 @@ extern Datum win866_to_iso(PG_FUNCTION_ARGS); * ---------- */ -static void koi8r2mic(unsigned char *l, unsigned char *p, int len); -static void mic2koi8r(unsigned char *mic, unsigned char *p, int len); -static void iso2mic(unsigned char *l, unsigned char *p, int len); -static void mic2iso(unsigned char *mic, unsigned char *p, int len); -static void win12512mic(unsigned char *l, unsigned char *p, int len); -static void mic2win1251(unsigned char *mic, unsigned char *p, int len); -static void win8662mic(unsigned char *l, unsigned char *p, int len); -static void mic2win866(unsigned char *mic, unsigned char *p, int len); +static void koi8r2mic(const unsigned char *l, unsigned char *p, int len); +static void mic2koi8r(const unsigned char *mic, unsigned char *p, int len); +static void iso2mic(const unsigned char *l, unsigned char *p, int len); +static void mic2iso(const unsigned char *mic, unsigned char *p, int len); +static void win12512mic(const unsigned char *l, unsigned char *p, int len); +static void mic2win1251(const unsigned char *mic, unsigned char *p, int len); +static void win8662mic(const unsigned char *l, unsigned char *p, int len); +static void mic2win866(const unsigned char *mic, unsigned char *p, int len); Datum koi8r_to_mic(PG_FUNCTION_ARGS) @@ -401,7 +401,7 @@ win1251_to_iso(PG_FUNCTION_ARGS) buf = palloc(len * ENCODING_GROWTH_RATE); win12512mic(src, buf, len); - mic2win1251(buf, dest, strlen((char *) buf)); + mic2iso(buf, dest, strlen((char *) buf)); pfree(buf); PG_RETURN_VOID(); @@ -441,7 +441,7 @@ win866_to_iso(PG_FUNCTION_ARGS) buf = palloc(len * ENCODING_GROWTH_RATE); win8662mic(src, buf, len); - mic2win866(buf, dest, strlen((char *) buf)); + mic2iso(buf, dest, strlen((char *) buf)); pfree(buf); PG_RETURN_VOID(); @@ -460,23 +460,23 @@ win866_to_iso(PG_FUNCTION_ARGS) /* koi8r2mic: KOI8-R to Mule internal code */ static void -koi8r2mic(unsigned char *l, unsigned char *p, int len) +koi8r2mic(const unsigned char *l, unsigned char *p, int len) { - latin2mic(l, p, len, LC_KOI8_R); + latin2mic(l, p, len, LC_KOI8_R, PG_KOI8R); } /* mic2koi8r: Mule internal code to KOI8-R */ static void -mic2koi8r(unsigned char *mic, unsigned char *p, int len) +mic2koi8r(const unsigned char *mic, unsigned char *p, int len) { - mic2latin(mic, p, len, LC_KOI8_R); + mic2latin(mic, p, len, LC_KOI8_R, PG_KOI8R); } /* iso2mic: ISO-8859-5 to Mule internal code */ static void -iso2mic(unsigned char *l, unsigned char *p, int len) +iso2mic(const unsigned char *l, unsigned char *p, int len) { - static unsigned char iso2koi[] = { + static const unsigned char iso2koi[] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, @@ -495,14 +495,14 @@ iso2mic(unsigned char *l, unsigned char *p, int len) 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; - latin2mic_with_table(l, p, len, LC_KOI8_R, iso2koi); + latin2mic_with_table(l, p, len, LC_KOI8_R, PG_ISO_8859_5, iso2koi); } /* mic2iso: Mule internal code to ISO8859-5 */ static void -mic2iso(unsigned char *mic, unsigned char *p, int len) +mic2iso(const unsigned char *mic, unsigned char *p, int len) { - static unsigned char koi2iso[] = { + static const unsigned char koi2iso[] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, @@ -521,14 +521,14 @@ mic2iso(unsigned char *mic, unsigned char *p, int len) 0xcc, 0xcb, 0xb7, 0xc8, 0xcd, 0xc9, 0xc7, 0xca }; - mic2latin_with_table(mic, p, len, LC_KOI8_R, koi2iso); + mic2latin_with_table(mic, p, len, LC_KOI8_R, PG_ISO_8859_5, koi2iso); } /* win2mic: CP1251 to Mule internal code */ static void -win12512mic(unsigned char *l, unsigned char *p, int len) +win12512mic(const unsigned char *l, unsigned char *p, int len) { - static unsigned char win2koi[] = { + static const unsigned char win2koi[] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, @@ -547,14 +547,14 @@ win12512mic(unsigned char *l, unsigned char *p, int len) 0xdb, 0xdd, 0xdf, 0xd9, 0xd8, 0xdc, 0xc0, 0xd1 }; - latin2mic_with_table(l, p, len, LC_KOI8_R, win2koi); + latin2mic_with_table(l, p, len, LC_KOI8_R, PG_WIN1251, win2koi); } /* mic2win: Mule internal code to CP1251 */ static void -mic2win1251(unsigned char *mic, unsigned char *p, int len) +mic2win1251(const unsigned char *mic, unsigned char *p, int len) { - static unsigned char koi2win[] = { + static const unsigned char koi2win[] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, @@ -573,14 +573,14 @@ mic2win1251(unsigned char *mic, unsigned char *p, int len) 0xdc, 0xdb, 0xc7, 0xd8, 0xdd, 0xd9, 0xd7, 0xda }; - mic2latin_with_table(mic, p, len, LC_KOI8_R, koi2win); + mic2latin_with_table(mic, p, len, LC_KOI8_R, PG_WIN1251, koi2win); } /* win8662mic: CP866 to Mule internal code */ static void -win8662mic(unsigned char *l, unsigned char *p, int len) +win8662mic(const unsigned char *l, unsigned char *p, int len) { - static unsigned char win8662koi[] = { + static const unsigned char win8662koi[] = { 0xe1, 0xe2, 0xf7, 0xe7, 0xe4, 0xe5, 0xf6, 0xfa, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf2, 0xf3, 0xf4, 0xf5, 0xe6, 0xe8, 0xe3, 0xfe, @@ -599,14 +599,14 @@ win8662mic(unsigned char *l, unsigned char *p, int len) 0xb6, 0xa6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; - latin2mic_with_table(l, p, len, LC_KOI8_R, win8662koi); + latin2mic_with_table(l, p, len, LC_KOI8_R, PG_WIN866, win8662koi); } /* mic2win866: Mule internal code to CP866 */ static void -mic2win866(unsigned char *mic, unsigned char *p, int len) +mic2win866(const unsigned char *mic, unsigned char *p, int len) { - static unsigned char koi2win866[] = { + static const unsigned char koi2win866[] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, @@ -625,5 +625,5 @@ mic2win866(unsigned char *mic, unsigned char *p, int len) 0x9c, 0x9b, 0x87, 0x98, 0x9d, 0x99, 0x97, 0x9a }; - mic2latin_with_table(mic, p, len, LC_KOI8_R, koi2win866); + mic2latin_with_table(mic, p, len, LC_KOI8_R, PG_WIN866, koi2win866); } diff --git a/src/backend/utils/mb/conversion_procs/euc_cn_and_mic/euc_cn_and_mic.c b/src/backend/utils/mb/conversion_procs/euc_cn_and_mic/euc_cn_and_mic.c index 8a057383849..9121ac41801 100644 --- a/src/backend/utils/mb/conversion_procs/euc_cn_and_mic/euc_cn_and_mic.c +++ b/src/backend/utils/mb/conversion_procs/euc_cn_and_mic/euc_cn_and_mic.c @@ -6,7 +6,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/euc_cn_and_mic/euc_cn_and_mic.c,v 1.13 2006/03/05 15:58:47 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/euc_cn_and_mic/euc_cn_and_mic.c,v 1.14 2006/05/21 20:05:19 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -32,8 +32,8 @@ extern Datum mic_to_euc_cn(PG_FUNCTION_ARGS); * ---------- */ -static void euc_cn2mic(unsigned char *euc, unsigned char *p, int len); -static void mic2euc_cn(unsigned char *mic, unsigned char *p, int len); +static void euc_cn2mic(const unsigned char *euc, unsigned char *p, int len); +static void mic2euc_cn(const unsigned char *mic, unsigned char *p, int len); Datum euc_cn_to_mic(PG_FUNCTION_ARGS) @@ -71,23 +71,30 @@ mic_to_euc_cn(PG_FUNCTION_ARGS) * EUC_CN ---> MIC */ static void -euc_cn2mic(unsigned char *euc, unsigned char *p, int len) +euc_cn2mic(const unsigned char *euc, unsigned char *p, int len) { int c1; - while (len >= 0 && (c1 = *euc++)) + while (len > 0) { + c1 = *euc; if (IS_HIGHBIT_SET(c1)) { - len -= 2; + if (len < 2 || !IS_HIGHBIT_SET(euc[1])) + report_invalid_encoding(PG_EUC_CN, (const char *) euc, len); *p++ = LC_GB2312_80; *p++ = c1; - *p++ = *euc++; + *p++ = euc[1]; + euc += 2; + len -= 2; } else { /* should be ASCII */ - len--; + if (c1 == 0) + report_invalid_encoding(PG_EUC_CN, (const char *) euc, len); *p++ = c1; + euc++; + len--; } } *p = '\0'; @@ -97,26 +104,35 @@ euc_cn2mic(unsigned char *euc, unsigned char *p, int len) * MIC ---> EUC_CN */ static void -mic2euc_cn(unsigned char *mic, unsigned char *p, int len) +mic2euc_cn(const unsigned char *mic, unsigned char *p, int len) { int c1; - while (len >= 0 && (c1 = *mic)) + while (len > 0) { - len -= pg_mic_mblen(mic++); - - if (c1 == LC_GB2312_80) + c1 = *mic; + if (IS_HIGHBIT_SET(c1)) { + if (c1 != LC_GB2312_80) + report_untranslatable_char(PG_MULE_INTERNAL, PG_EUC_CN, + (const char *) mic, len); + if (len < 3 || !IS_HIGHBIT_SET(mic[1]) || !IS_HIGHBIT_SET(mic[2])) + report_invalid_encoding(PG_MULE_INTERNAL, + (const char *) mic, len); + mic++; *p++ = *mic++; *p++ = *mic++; - } - else if (IS_HIGHBIT_SET(c1)) - { /* cannot convert to EUC_CN! */ - mic--; - pg_print_bogus_char(&mic, &p); + len -= 3; } else - *p++ = c1; /* should be ASCII */ + { /* should be ASCII */ + if (c1 == 0) + report_invalid_encoding(PG_MULE_INTERNAL, + (const char *) mic, len); + *p++ = c1; + mic++; + len--; + } } *p = '\0'; } diff --git a/src/backend/utils/mb/conversion_procs/euc_jp_and_sjis/euc_jp_and_sjis.c b/src/backend/utils/mb/conversion_procs/euc_jp_and_sjis/euc_jp_and_sjis.c index 31fb7375003..372dda2c9fe 100644 --- a/src/backend/utils/mb/conversion_procs/euc_jp_and_sjis/euc_jp_and_sjis.c +++ b/src/backend/utils/mb/conversion_procs/euc_jp_and_sjis/euc_jp_and_sjis.c @@ -6,7 +6,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/euc_jp_and_sjis/euc_jp_and_sjis.c,v 1.15 2006/03/04 10:57:35 ishii Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/euc_jp_and_sjis/euc_jp_and_sjis.c,v 1.16 2006/05/21 20:05:19 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -22,9 +22,6 @@ #define PGSJISALTCODE 0x81ac #define PGEUCALTCODE 0xa2ae -#define ISSJISHEAD(c) ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xfc)) -#define ISSJISTAIL(c) ((c >= 0x40 && c <= 0x7e) || (c >= 0x80 && c <= 0xfc)) - /* * conversion table between SJIS UDC (IBM kanji) and EUC_JP */ @@ -57,12 +54,12 @@ extern Datum mic_to_sjis(PG_FUNCTION_ARGS); * ---------- */ -static void sjis2mic(unsigned char *sjis, unsigned char *p, int len); -static void mic2sjis(unsigned char *mic, unsigned char *p, int len); -static void euc_jp2mic(unsigned char *euc, unsigned char *p, int len); -static void mic2euc_jp(unsigned char *mic, unsigned char *p, int len); -static void euc_jp2sjis(unsigned char *mic, unsigned char *p, int len); -static void sjis2euc_jp(unsigned char *mic, unsigned char *p, int len); +static void sjis2mic(const unsigned char *sjis, unsigned char *p, int len); +static void mic2sjis(const unsigned char *mic, unsigned char *p, int len); +static void euc_jp2mic(const unsigned char *euc, unsigned char *p, int len); +static void mic2euc_jp(const unsigned char *mic, unsigned char *p, int len); +static void euc_jp2sjis(const unsigned char *mic, unsigned char *p, int len); +static void sjis2euc_jp(const unsigned char *mic, unsigned char *p, int len); Datum euc_jp_to_sjis(PG_FUNCTION_ARGS) @@ -164,38 +161,34 @@ mic_to_sjis(PG_FUNCTION_ARGS) * SJIS ---> MIC */ static void -sjis2mic(unsigned char *sjis, unsigned char *p, int len) +sjis2mic(const unsigned char *sjis, unsigned char *p, int len) { int c1, c2, -/* Eiji Tokuya patched begin */ i, k, k2; -/* Eiji Tokuya patched end */ - while (len >= 0 && (c1 = *sjis++)) + while (len > 0) { + c1 = *sjis; if (c1 >= 0xa1 && c1 <= 0xdf) { /* JIS X0201 (1 byte kana) */ - len--; *p++ = LC_JISX0201K; *p++ = c1; + sjis++; + len--; } else if (IS_HIGHBIT_SET(c1)) { /* * JIS X0208, X0212, user defined extended characters */ - c2 = *sjis++; - if (!ISSJISHEAD(c1) || !ISSJISTAIL(c2)) - ereport(ERROR, - (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), - errmsg("invalid byte sequence for encoding \"SJIS\": 0x%02x%02x", - c1, c2))); + if (len < 2 || !ISSJISHEAD(c1) || !ISSJISTAIL(sjis[1])) + report_invalid_encoding(PG_SJIS, (const char *) sjis, len); + c2 = sjis[1]; k = (c1 << 8) + c2; -/* Eiji Tokuya patched begin */ if (k >= 0xed40 && k < 0xf040) { /* NEC selection IBM kanji */ @@ -214,19 +207,15 @@ sjis2mic(unsigned char *sjis, unsigned char *p, int len) } if (k < 0xeb3f) -/* Eiji Tokuya patched end */ { /* JIS X0208 */ - len -= 2; *p++ = LC_JISX0208; *p++ = ((c1 & 0x3f) << 1) + 0x9f + (c2 > 0x9e); *p++ = c2 + ((c2 > 0x9e) ? 2 : 0x60) + (c2 < 0x80); } -/* Eiji Tokuya patched begin */ else if ((k >= 0xeb40 && k < 0xf040) || (k >= 0xfc4c && k <= 0xfcfc)) { /* NEC selection IBM kanji - Other undecided justice */ -/* Eiji Tokuya patched end */ *p++ = LC_JISX0208; *p++ = PGEUCALTCODE >> 8; *p++ = PGEUCALTCODE & 0xff; @@ -237,7 +226,6 @@ sjis2mic(unsigned char *sjis, unsigned char *p, int len) * UDC1 mapping to X0208 85 ku - 94 ku JIS code 0x7521 - * 0x7e7e EUC 0xf5a1 - 0xfefe */ - len -= 2; *p++ = LC_JISX0208; c1 -= 0x6f; *p++ = ((c1 & 0x3f) << 1) + 0xf3 + (c2 > 0x9e); @@ -249,7 +237,6 @@ sjis2mic(unsigned char *sjis, unsigned char *p, int len) * UDC2 mapping to X0212 85 ku - 94 ku JIS code 0x7521 - * 0x7e7e EUC 0x8ff5a1 - 0x8ffefe */ - len -= 2; *p++ = LC_JISX0212; c1 -= 0x74; *p++ = ((c1 & 0x3f) << 1) + 0xf3 + (c2 > 0x9e); @@ -259,9 +246,7 @@ sjis2mic(unsigned char *sjis, unsigned char *p, int len) { /* * mapping IBM kanji to X0208 and X0212 - * */ - len -= 2; for (i = 0;; i++) { k2 = ibmkanji[i].sjis; @@ -285,11 +270,16 @@ sjis2mic(unsigned char *sjis, unsigned char *p, int len) } } } + sjis += 2; + len -= 2; } else { /* should be ASCII */ - len--; + if (c1 == 0) + report_invalid_encoding(PG_SJIS, (const char *) sjis, len); *p++ = c1; + sjis++; + len--; } } *p = '\0'; @@ -299,22 +289,37 @@ sjis2mic(unsigned char *sjis, unsigned char *p, int len) * MIC ---> SJIS */ static void -mic2sjis(unsigned char *mic, unsigned char *p, int len) +mic2sjis(const unsigned char *mic, unsigned char *p, int len) { int c1, c2, - k; + k, + l; - while (len >= 0 && (c1 = *mic)) + while (len > 0) { - len -= pg_mic_mblen(mic++); - + c1 = *mic; + if (!IS_HIGHBIT_SET(c1)) + { + /* ASCII */ + if (c1 == 0) + report_invalid_encoding(PG_MULE_INTERNAL, + (const char *) mic, len); + *p++ = c1; + mic++; + len--; + continue; + } + l = pg_encoding_verifymb(PG_MULE_INTERNAL, (const char *) mic, len); + if (l < 0) + report_invalid_encoding(PG_MULE_INTERNAL, + (const char *) mic, len); if (c1 == LC_JISX0201K) - *p++ = *mic++; + *p++ = mic[1]; else if (c1 == LC_JISX0208) { - c1 = *mic++; - c2 = *mic++; + c1 = mic[1]; + c2 = mic[2]; k = (c1 << 8) | (c2 & 0xff); if (k >= 0xf5a1) { @@ -331,8 +336,8 @@ mic2sjis(unsigned char *mic, unsigned char *p, int len) int i, k2; - c1 = *mic++; - c2 = *mic++; + c1 = mic[1]; + c2 = mic[2]; k = c1 << 8 | c2; if (k >= 0xf5a1) { @@ -363,14 +368,11 @@ mic2sjis(unsigned char *mic, unsigned char *p, int len) } } } - else if (IS_HIGHBIT_SET(c1)) - { - /* cannot convert to SJIS! */ - *p++ = PGSJISALTCODE >> 8; - *p++ = PGSJISALTCODE & 0xff; - } else - *p++ = c1; /* should be ASCII */ + report_untranslatable_char(PG_MULE_INTERNAL, PG_SJIS, + (const char *) mic, len); + mic += l; + len -= l; } *p = '\0'; } @@ -379,37 +381,48 @@ mic2sjis(unsigned char *mic, unsigned char *p, int len) * EUC_JP ---> MIC */ static void -euc_jp2mic(unsigned char *euc, unsigned char *p, int len) +euc_jp2mic(const unsigned char *euc, unsigned char *p, int len) { int c1; + int l; - while (len >= 0 && (c1 = *euc++)) + while (len > 0) { + c1 = *euc; + if (!IS_HIGHBIT_SET(c1)) + { + /* ASCII */ + if (c1 == 0) + report_invalid_encoding(PG_EUC_JP, + (const char *) euc, len); + *p++ = c1; + euc++; + len--; + continue; + } + l = pg_encoding_verifymb(PG_EUC_JP, (const char *) euc, len); + if (l < 0) + report_invalid_encoding(PG_EUC_JP, + (const char *) euc, len); if (c1 == SS2) { /* 1 byte kana? */ - len -= 2; *p++ = LC_JISX0201K; - *p++ = *euc++; + *p++ = euc[1]; } else if (c1 == SS3) { /* JIS X0212 kanji? */ - len -= 3; *p++ = LC_JISX0212; - *p++ = *euc++; - *p++ = *euc++; + *p++ = euc[1]; + *p++ = euc[2]; } - else if (c1 & 0x80) + else { /* kanji? */ - len -= 2; *p++ = LC_JISX0208; *p++ = c1; - *p++ = *euc++; - } - else - { /* should be ASCII */ - len--; - *p++ = c1; + *p++ = euc[1]; } + euc += l; + len -= l; } *p = '\0'; } @@ -418,37 +431,50 @@ euc_jp2mic(unsigned char *euc, unsigned char *p, int len) * MIC ---> EUC_JP */ static void -mic2euc_jp(unsigned char *mic, unsigned char *p, int len) +mic2euc_jp(const unsigned char *mic, unsigned char *p, int len) { int c1; + int l; - while (len >= 0 && (c1 = *mic)) + while (len > 0) { - len -= pg_mic_mblen(mic++); - + c1 = *mic; + if (!IS_HIGHBIT_SET(c1)) + { + /* ASCII */ + if (c1 == 0) + report_invalid_encoding(PG_MULE_INTERNAL, + (const char *) mic, len); + *p++ = c1; + mic++; + len--; + continue; + } + l = pg_encoding_verifymb(PG_MULE_INTERNAL, (const char *) mic, len); + if (l < 0) + report_invalid_encoding(PG_MULE_INTERNAL, + (const char *) mic, len); if (c1 == LC_JISX0201K) { *p++ = SS2; - *p++ = *mic++; + *p++ = mic[1]; } else if (c1 == LC_JISX0212) { *p++ = SS3; - *p++ = *mic++; - *p++ = *mic++; + *p++ = mic[1]; + *p++ = mic[2]; } else if (c1 == LC_JISX0208) { - *p++ = *mic++; - *p++ = *mic++; - } - else if (IS_HIGHBIT_SET(c1)) - { /* cannot convert to EUC_JP! */ - mic--; - pg_print_bogus_char(&mic, &p); + *p++ = mic[1]; + *p++ = mic[2]; } else - *p++ = c1; /* should be ASCII */ + report_untranslatable_char(PG_MULE_INTERNAL, PG_EUC_JP, + (const char *) mic, len); + mic += l; + len -= l; } *p = '\0'; } @@ -457,30 +483,41 @@ mic2euc_jp(unsigned char *mic, unsigned char *p, int len) * EUC_JP -> SJIS */ static void -euc_jp2sjis(unsigned char *euc, unsigned char *p, int len) +euc_jp2sjis(const unsigned char *euc, unsigned char *p, int len) { int c1, c2, k; - unsigned char *euc_end = euc + len; + int l; - while (euc_end >= euc && (c1 = *euc++)) + while (len > 0) { - if (c1 < 0x80) + c1 = *euc; + if (!IS_HIGHBIT_SET(c1)) { - /* should be ASCII */ + /* ASCII */ + if (c1 == 0) + report_invalid_encoding(PG_EUC_JP, + (const char *) euc, len); *p++ = c1; + euc++; + len--; + continue; } - else if (c1 == SS2) + l = pg_encoding_verifymb(PG_EUC_JP, (const char *) euc, len); + if (l < 0) + report_invalid_encoding(PG_EUC_JP, + (const char *) euc, len); + if (c1 == SS2) { /* hankaku kana? */ - *p++ = *euc++; + *p++ = euc[1]; } else if (c1 == SS3) { /* JIS X0212 kanji? */ - c1 = *euc++; - c2 = *euc++; + c1 = euc[1]; + c2 = euc[2]; k = c1 << 8 | c2; if (k >= 0xf5a1) { @@ -517,7 +554,7 @@ euc_jp2sjis(unsigned char *euc, unsigned char *p, int len) else { /* JIS X0208 kanji? */ - c2 = *euc++; + c2 = euc[1]; k = (c1 << 8) | (c2 & 0xff); if (k >= 0xf5a1) { @@ -529,6 +566,8 @@ euc_jp2sjis(unsigned char *euc, unsigned char *p, int len) *p++ = ((c1 - 0xa1) >> 1) + ((c1 < 0xdf) ? 0x81 : 0xc1); *p++ = c2 - ((c1 & 1) ? ((c2 < 0xe0) ? 0x61 : 0x60) : 2); } + euc += l; + len -= l; } *p = '\0'; } @@ -537,23 +576,34 @@ euc_jp2sjis(unsigned char *euc, unsigned char *p, int len) * SJIS ---> EUC_JP */ static void -sjis2euc_jp(unsigned char *sjis, unsigned char *p, int len) +sjis2euc_jp(const unsigned char *sjis, unsigned char *p, int len) { int c1, c2, i, k, k2; - unsigned char *sjis_end = sjis + len; + int l; - while (sjis_end >= sjis && (c1 = *sjis++)) + while (len > 0) { - if (c1 < 0x80) + c1 = *sjis; + if (!IS_HIGHBIT_SET(c1)) { - /* should be ASCII */ + /* ASCII */ + if (c1 == 0) + report_invalid_encoding(PG_SJIS, + (const char *) sjis, len); *p++ = c1; + sjis++; + len--; + continue; } - else if (c1 >= 0xa1 && c1 <= 0xdf) + l = pg_encoding_verifymb(PG_SJIS, (const char *) sjis, len); + if (l < 0) + report_invalid_encoding(PG_SJIS, + (const char *) sjis, len); + if (c1 >= 0xa1 && c1 <= 0xdf) { /* JIS X0201 (1 byte kana) */ *p++ = SS2; @@ -564,12 +614,7 @@ sjis2euc_jp(unsigned char *sjis, unsigned char *p, int len) /* * JIS X0208, X0212, user defined extended characters */ - c2 = *sjis++; - if (!ISSJISHEAD(c1) || !ISSJISTAIL(c2)) - ereport(ERROR, - (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), - errmsg("invalid byte sequence for encoding \"SJIS\": 0x%02x%02x", - c1, c2))); + c2 = sjis[1]; k = (c1 << 8) + c2; if (k >= 0xed40 && k < 0xf040) { @@ -650,6 +695,8 @@ sjis2euc_jp(unsigned char *sjis, unsigned char *p, int len) } } } + sjis += l; + len -= l; } *p = '\0'; } diff --git a/src/backend/utils/mb/conversion_procs/euc_kr_and_mic/euc_kr_and_mic.c b/src/backend/utils/mb/conversion_procs/euc_kr_and_mic/euc_kr_and_mic.c index ff08a4f5372..a4248039085 100644 --- a/src/backend/utils/mb/conversion_procs/euc_kr_and_mic/euc_kr_and_mic.c +++ b/src/backend/utils/mb/conversion_procs/euc_kr_and_mic/euc_kr_and_mic.c @@ -6,7 +6,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/euc_kr_and_mic/euc_kr_and_mic.c,v 1.13 2006/03/05 15:58:47 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/euc_kr_and_mic/euc_kr_and_mic.c,v 1.14 2006/05/21 20:05:19 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -32,8 +32,8 @@ extern Datum mic_to_euc_kr(PG_FUNCTION_ARGS); * ---------- */ -static void euc_kr2mic(unsigned char *euc, unsigned char *p, int len); -static void mic2euc_kr(unsigned char *mic, unsigned char *p, int len); +static void euc_kr2mic(const unsigned char *euc, unsigned char *p, int len); +static void mic2euc_kr(const unsigned char *mic, unsigned char *p, int len); Datum euc_kr_to_mic(PG_FUNCTION_ARGS) @@ -71,23 +71,34 @@ mic_to_euc_kr(PG_FUNCTION_ARGS) * EUC_KR ---> MIC */ static void -euc_kr2mic(unsigned char *euc, unsigned char *p, int len) +euc_kr2mic(const unsigned char *euc, unsigned char *p, int len) { int c1; + int l; - while (len >= 0 && (c1 = *euc++)) + while (len > 0) { + c1 = *euc; if (IS_HIGHBIT_SET(c1)) { - len -= 2; + l = pg_encoding_verifymb(PG_EUC_KR, (const char *) euc, len); + if (l != 2) + report_invalid_encoding(PG_EUC_KR, + (const char *) euc, len); *p++ = LC_KS5601; *p++ = c1; - *p++ = *euc++; + *p++ = euc[1]; + euc += 2; + len -= 2; } else { /* should be ASCII */ - len--; + if (c1 == 0) + report_invalid_encoding(PG_EUC_KR, + (const char *) euc, len); *p++ = c1; + euc++; + len--; } } *p = '\0'; @@ -97,26 +108,39 @@ euc_kr2mic(unsigned char *euc, unsigned char *p, int len) * MIC ---> EUC_KR */ static void -mic2euc_kr(unsigned char *mic, unsigned char *p, int len) +mic2euc_kr(const unsigned char *mic, unsigned char *p, int len) { int c1; + int l; - while (len >= 0 && (c1 = *mic)) + while (len > 0) { - len -= pg_mic_mblen(mic++); - - if (c1 == LC_KS5601) + c1 = *mic; + if (!IS_HIGHBIT_SET(c1)) { - *p++ = *mic++; - *p++ = *mic++; + /* ASCII */ + if (c1 == 0) + report_invalid_encoding(PG_MULE_INTERNAL, + (const char *) mic, len); + *p++ = c1; + mic++; + len--; + continue; } - else if (IS_HIGHBIT_SET(c1)) - { /* cannot convert to EUC_KR! */ - mic--; - pg_print_bogus_char(&mic, &p); + l = pg_encoding_verifymb(PG_MULE_INTERNAL, (const char *) mic, len); + if (l < 0) + report_invalid_encoding(PG_MULE_INTERNAL, + (const char *) mic, len); + if (c1 == LC_KS5601) + { + *p++ = mic[1]; + *p++ = mic[2]; } else - *p++ = c1; /* should be ASCII */ + report_untranslatable_char(PG_MULE_INTERNAL, PG_EUC_KR, + (const char *) mic, len); + mic += l; + len -= l; } *p = '\0'; } diff --git a/src/backend/utils/mb/conversion_procs/euc_tw_and_big5/euc_tw_and_big5.c b/src/backend/utils/mb/conversion_procs/euc_tw_and_big5/euc_tw_and_big5.c index 501dab46537..28ca458f7cb 100644 --- a/src/backend/utils/mb/conversion_procs/euc_tw_and_big5/euc_tw_and_big5.c +++ b/src/backend/utils/mb/conversion_procs/euc_tw_and_big5/euc_tw_and_big5.c @@ -6,7 +6,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/euc_tw_and_big5/euc_tw_and_big5.c,v 1.13 2006/03/05 15:58:47 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/euc_tw_and_big5/euc_tw_and_big5.c,v 1.14 2006/05/21 20:05:20 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -42,10 +42,10 @@ extern Datum mic_to_big5(PG_FUNCTION_ARGS); * ---------- */ -static void big52mic(unsigned char *big5, unsigned char *p, int len); -static void mic2big5(unsigned char *mic, unsigned char *p, int len); -static void euc_tw2mic(unsigned char *euc, unsigned char *p, int len); -static void mic2euc_tw(unsigned char *mic, unsigned char *p, int len); +static void big52mic(const unsigned char *big5, unsigned char *p, int len); +static void mic2big5(const unsigned char *mic, unsigned char *p, int len); +static void euc_tw2mic(const unsigned char *euc, unsigned char *p, int len); +static void mic2euc_tw(const unsigned char *mic, unsigned char *p, int len); Datum euc_tw_to_big5(PG_FUNCTION_ARGS) @@ -114,7 +114,7 @@ mic_to_euc_tw(PG_FUNCTION_ARGS) Assert(PG_GETARG_INT32(1) == PG_EUC_TW); Assert(len >= 0); - mic2big5(src, dest, len); + mic2euc_tw(src, dest, len); PG_RETURN_VOID(); } @@ -155,39 +155,52 @@ mic_to_big5(PG_FUNCTION_ARGS) * EUC_TW ---> MIC */ static void -euc_tw2mic(unsigned char *euc, unsigned char *p, int len) +euc_tw2mic(const unsigned char *euc, unsigned char *p, int len) { int c1; + int l; - while (len >= 0 && (c1 = *euc++)) + while (len > 0) { - if (c1 == SS2) + c1 = *euc; + if (IS_HIGHBIT_SET(c1)) { - len -= 4; - c1 = *euc++; /* plane No. */ - if (c1 == 0xa1) - *p++ = LC_CNS11643_1; - else if (c1 == 0xa2) - *p++ = LC_CNS11643_2; - else + l = pg_encoding_verifymb(PG_EUC_TW, (const char *) euc, len); + if (l < 0) + report_invalid_encoding(PG_EUC_TW, + (const char *) euc, len); + if (c1 == SS2) { - *p++ = 0x9d; /* LCPRV2 */ - *p++ = 0xa3 - c1 + LC_CNS11643_3; + c1 = euc[1]; /* plane No. */ + if (c1 == 0xa1) + *p++ = LC_CNS11643_1; + else if (c1 == 0xa2) + *p++ = LC_CNS11643_2; + else + { + *p++ = 0x9d; /* LCPRV2 */ + *p++ = c1 - 0xa3 + LC_CNS11643_3; + } + *p++ = euc[2]; + *p++ = euc[3]; } - *p++ = *euc++; - *p++ = *euc++; - } - else if (IS_HIGHBIT_SET(c1)) - { /* CNS11643-1 */ - len -= 2; - *p++ = LC_CNS11643_1; - *p++ = c1; - *p++ = *euc++; + else + { /* CNS11643-1 */ + *p++ = LC_CNS11643_1; + *p++ = c1; + *p++ = euc[1]; + } + euc += l; + len -= l; } else { /* should be ASCII */ - len--; + if (c1 == 0) + report_invalid_encoding(PG_EUC_TW, + (const char *) euc, len); *p++ = c1; + euc++; + len--; } } *p = '\0'; @@ -197,40 +210,54 @@ euc_tw2mic(unsigned char *euc, unsigned char *p, int len) * MIC ---> EUC_TW */ static void -mic2euc_tw(unsigned char *mic, unsigned char *p, int len) +mic2euc_tw(const unsigned char *mic, unsigned char *p, int len) { int c1; + int l; - while (len >= 0 && (c1 = *mic)) + while (len > 0) { - len -= pg_mic_mblen(mic++); - + c1 = *mic; + if (!IS_HIGHBIT_SET(c1)) + { + /* ASCII */ + if (c1 == 0) + report_invalid_encoding(PG_MULE_INTERNAL, + (const char *) mic, len); + *p++ = c1; + mic++; + len--; + continue; + } + l = pg_encoding_verifymb(PG_MULE_INTERNAL, (const char *) mic, len); + if (l < 0) + report_invalid_encoding(PG_MULE_INTERNAL, + (const char *) mic, len); if (c1 == LC_CNS11643_1) { - *p++ = *mic++; - *p++ = *mic++; + *p++ = mic[1]; + *p++ = mic[2]; } else if (c1 == LC_CNS11643_2) { *p++ = SS2; *p++ = 0xa2; - *p++ = *mic++; - *p++ = *mic++; + *p++ = mic[1]; + *p++ = mic[2]; } - else if (c1 == 0x9d) + else if (c1 == 0x9d && + mic[1] >= LC_CNS11643_3 && mic[1] <= LC_CNS11643_7) { /* LCPRV2? */ *p++ = SS2; - *p++ = *mic++ - LC_CNS11643_3 + 0xa3; - *p++ = *mic++; - *p++ = *mic++; - } - else if (IS_HIGHBIT_SET(c1)) - { /* cannot convert to EUC_TW! */ - mic--; - pg_print_bogus_char(&mic, &p); + *p++ = mic[1] - LC_CNS11643_3 + 0xa3; + *p++ = mic[2]; + *p++ = mic[3]; } else - *p++ = c1; /* should be ASCII */ + report_untranslatable_char(PG_MULE_INTERNAL, PG_EUC_TW, + (const char *) mic, len); + mic += l; + len -= l; } *p = '\0'; } @@ -239,52 +266,49 @@ mic2euc_tw(unsigned char *mic, unsigned char *p, int len) * Big5 ---> MIC */ static void -big52mic(unsigned char *big5, unsigned char *p, int len) +big52mic(const unsigned char *big5, unsigned char *p, int len) { unsigned short c1; unsigned short big5buf, cnsBuf; unsigned char lc; - char bogusBuf[3]; - int i; + int l; - while (len >= 0 && (c1 = *big5++)) + while (len > 0) { + c1 = *big5; if (!IS_HIGHBIT_SET(c1)) - { /* ASCII */ - len--; + { + /* ASCII */ + if (c1 == 0) + report_invalid_encoding(PG_BIG5, + (const char *) big5, len); *p++ = c1; + big5++; + len--; + continue; } - else + l = pg_encoding_verifymb(PG_BIG5, (const char *) big5, len); + if (l < 0) + report_invalid_encoding(PG_BIG5, + (const char *) big5, len); + big5buf = (c1 << 8) | big5[1]; + cnsBuf = BIG5toCNS(big5buf, &lc); + if (lc != 0) { - len -= 2; - big5buf = c1 << 8; - c1 = *big5++; - big5buf |= c1; - cnsBuf = BIG5toCNS(big5buf, &lc); - if (lc != 0) + if (lc == LC_CNS11643_3 || lc == LC_CNS11643_4) { - if (lc == LC_CNS11643_3 || lc == LC_CNS11643_4) - { - *p++ = 0x9d; /* LCPRV2 */ - } - *p++ = lc; /* Plane No. */ - *p++ = (cnsBuf >> 8) & 0x00ff; - *p++ = cnsBuf & 0x00ff; - } - else - { /* cannot convert */ - big5 -= 2; - *p++ = '('; - for (i = 0; i < 2; i++) - { - sprintf(bogusBuf, "%02x", *big5++); - *p++ = bogusBuf[0]; - *p++ = bogusBuf[1]; - } - *p++ = ')'; + *p++ = 0x9d; /* LCPRV2 */ } + *p++ = lc; /* Plane No. */ + *p++ = (cnsBuf >> 8) & 0x00ff; + *p++ = cnsBuf & 0x00ff; } + else + report_untranslatable_char(PG_BIG5, PG_MULE_INTERNAL, + (const char *) big5, len); + big5 += l; + len -= l; } *p = '\0'; } @@ -293,46 +317,55 @@ big52mic(unsigned char *big5, unsigned char *p, int len) * MIC ---> Big5 */ static void -mic2big5(unsigned char *mic, unsigned char *p, int len) +mic2big5(const unsigned char *mic, unsigned char *p, int len) { - int l; unsigned short c1; unsigned short big5buf, cnsBuf; + int l; - while (len >= 0 && (c1 = *mic)) + while (len > 0) { - l = pg_mic_mblen(mic++); - len -= l; - + c1 = *mic; + if (!IS_HIGHBIT_SET(c1)) + { + /* ASCII */ + if (c1 == 0) + report_invalid_encoding(PG_MULE_INTERNAL, + (const char *) mic, len); + *p++ = c1; + mic++; + len--; + continue; + } + l = pg_encoding_verifymb(PG_MULE_INTERNAL, (const char *) mic, len); + if (l < 0) + report_invalid_encoding(PG_MULE_INTERNAL, + (const char *) mic, len); /* 0x9d means LCPRV2 */ if (c1 == LC_CNS11643_1 || c1 == LC_CNS11643_2 || c1 == 0x9d) { if (c1 == 0x9d) { - c1 = *mic++; /* get plane no. */ - } - cnsBuf = (*mic++) << 8; - cnsBuf |= (*mic++) & 0x00ff; - big5buf = CNStoBIG5(cnsBuf, c1); - if (big5buf == 0) - { /* cannot convert to Big5! */ - mic -= l; - pg_print_bogus_char(&mic, &p); + c1 = mic[1]; /* get plane no. */ + cnsBuf = (mic[2] << 8) | mic[3]; } else { - *p++ = (big5buf >> 8) & 0x00ff; - *p++ = big5buf & 0x00ff; + cnsBuf = (mic[1] << 8) | mic[2]; } + big5buf = CNStoBIG5(cnsBuf, c1); + if (big5buf == 0) + report_untranslatable_char(PG_MULE_INTERNAL, PG_BIG5, + (const char *) mic, len); + *p++ = (big5buf >> 8) & 0x00ff; + *p++ = big5buf & 0x00ff; } - else if (!IS_HIGHBIT_SET(c1)) /* ASCII */ - *p++ = c1; else - { /* cannot convert to Big5! */ - mic--; - pg_print_bogus_char(&mic, &p); - } + report_untranslatable_char(PG_MULE_INTERNAL, PG_BIG5, + (const char *) mic, len); + mic += l; + len -= l; } *p = '\0'; } diff --git a/src/backend/utils/mb/conversion_procs/latin2_and_win1250/latin2_and_win1250.c b/src/backend/utils/mb/conversion_procs/latin2_and_win1250/latin2_and_win1250.c index b85ebf49b92..5563b20ee21 100644 --- a/src/backend/utils/mb/conversion_procs/latin2_and_win1250/latin2_and_win1250.c +++ b/src/backend/utils/mb/conversion_procs/latin2_and_win1250/latin2_and_win1250.c @@ -6,7 +6,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/latin2_and_win1250/latin2_and_win1250.c,v 1.11 2006/03/05 15:58:47 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/latin2_and_win1250/latin2_and_win1250.c,v 1.12 2006/05/21 20:05:20 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -42,10 +42,10 @@ extern Datum win1250_to_latin2(PG_FUNCTION_ARGS); * ---------- */ -static void latin22mic(unsigned char *l, unsigned char *p, int len); -static void mic2latin2(unsigned char *mic, unsigned char *p, int len); -static void win12502mic(unsigned char *l, unsigned char *p, int len); -static void mic2win1250(unsigned char *mic, unsigned char *p, int len); +static void latin22mic(const unsigned char *l, unsigned char *p, int len); +static void mic2latin2(const unsigned char *mic, unsigned char *p, int len); +static void win12502mic(const unsigned char *l, unsigned char *p, int len); +static void mic2win1250(const unsigned char *mic, unsigned char *p, int len); Datum latin2_to_mic(PG_FUNCTION_ARGS) @@ -152,14 +152,15 @@ win1250_to_latin2(PG_FUNCTION_ARGS) } static void -latin22mic(unsigned char *l, unsigned char *p, int len) +latin22mic(const unsigned char *l, unsigned char *p, int len) { - latin2mic(l, p, len, LC_ISO8859_2); + latin2mic(l, p, len, LC_ISO8859_2, PG_LATIN2); } + static void -mic2latin2(unsigned char *mic, unsigned char *p, int len) +mic2latin2(const unsigned char *mic, unsigned char *p, int len) { - mic2latin(mic, p, len, LC_ISO8859_2); + mic2latin(mic, p, len, LC_ISO8859_2, PG_LATIN2); } /*----------------------------------------------------------------- @@ -167,9 +168,9 @@ mic2latin2(unsigned char *mic, unsigned char *p, int len) * Microsoft's CP1250(windows-1250) *-----------------------------------------------------------------*/ static void -win12502mic(unsigned char *l, unsigned char *p, int len) +win12502mic(const unsigned char *l, unsigned char *p, int len) { - static unsigned char win1250_2_iso88592[] = { + static const unsigned char win1250_2_iso88592[] = { 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0xA9, 0x8B, 0xA6, 0xAB, 0xAE, 0xAC, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, @@ -188,12 +189,14 @@ win12502mic(unsigned char *l, unsigned char *p, int len) 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF }; - latin2mic_with_table(l, p, len, LC_ISO8859_2, win1250_2_iso88592); + latin2mic_with_table(l, p, len, LC_ISO8859_2, PG_WIN1250, + win1250_2_iso88592); } + static void -mic2win1250(unsigned char *mic, unsigned char *p, int len) +mic2win1250(const unsigned char *mic, unsigned char *p, int len) { - static unsigned char iso88592_2_win1250[] = { + static const unsigned char iso88592_2_win1250[] = { 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x00, 0x8B, 0x00, 0x00, 0x00, 0x00, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, @@ -212,5 +215,6 @@ mic2win1250(unsigned char *mic, unsigned char *p, int len) 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF }; - mic2latin_with_table(mic, p, len, LC_ISO8859_2, iso88592_2_win1250); + mic2latin_with_table(mic, p, len, LC_ISO8859_2, PG_WIN1250, + iso88592_2_win1250); } diff --git a/src/backend/utils/mb/conversion_procs/latin_and_mic/latin_and_mic.c b/src/backend/utils/mb/conversion_procs/latin_and_mic/latin_and_mic.c index adf72e265d0..14c220c7bb1 100644 --- a/src/backend/utils/mb/conversion_procs/latin_and_mic/latin_and_mic.c +++ b/src/backend/utils/mb/conversion_procs/latin_and_mic/latin_and_mic.c @@ -6,7 +6,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/latin_and_mic/latin_and_mic.c,v 1.11 2006/03/05 15:58:47 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/latin_and_mic/latin_and_mic.c,v 1.12 2006/05/21 20:05:20 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -40,12 +40,12 @@ extern Datum mic_to_latin4(PG_FUNCTION_ARGS); * ---------- */ -static void latin12mic(unsigned char *l, unsigned char *p, int len); -static void mic2latin1(unsigned char *mic, unsigned char *p, int len); -static void latin32mic(unsigned char *l, unsigned char *p, int len); -static void mic2latin3(unsigned char *mic, unsigned char *p, int len); -static void latin42mic(unsigned char *l, unsigned char *p, int len); -static void mic2latin4(unsigned char *mic, unsigned char *p, int len); +static void latin12mic(const unsigned char *l, unsigned char *p, int len); +static void mic2latin1(const unsigned char *mic, unsigned char *p, int len); +static void latin32mic(const unsigned char *l, unsigned char *p, int len); +static void mic2latin3(const unsigned char *mic, unsigned char *p, int len); +static void latin42mic(const unsigned char *l, unsigned char *p, int len); +static void mic2latin4(const unsigned char *mic, unsigned char *p, int len); Datum latin1_to_mic(PG_FUNCTION_ARGS) @@ -144,32 +144,37 @@ mic_to_latin4(PG_FUNCTION_ARGS) } static void -latin12mic(unsigned char *l, unsigned char *p, int len) +latin12mic(const unsigned char *l, unsigned char *p, int len) { - latin2mic(l, p, len, LC_ISO8859_1); + latin2mic(l, p, len, LC_ISO8859_1, PG_LATIN1); } + static void -mic2latin1(unsigned char *mic, unsigned char *p, int len) +mic2latin1(const unsigned char *mic, unsigned char *p, int len) { - mic2latin(mic, p, len, LC_ISO8859_1); + mic2latin(mic, p, len, LC_ISO8859_1, PG_LATIN1); } + static void -latin32mic(unsigned char *l, unsigned char *p, int len) +latin32mic(const unsigned char *l, unsigned char *p, int len) { - latin2mic(l, p, len, LC_ISO8859_3); + latin2mic(l, p, len, LC_ISO8859_3, PG_LATIN3); } + static void -mic2latin3(unsigned char *mic, unsigned char *p, int len) +mic2latin3(const unsigned char *mic, unsigned char *p, int len) { - mic2latin(mic, p, len, LC_ISO8859_3); + mic2latin(mic, p, len, LC_ISO8859_3, PG_LATIN3); } + static void -latin42mic(unsigned char *l, unsigned char *p, int len) +latin42mic(const unsigned char *l, unsigned char *p, int len) { - latin2mic(l, p, len, LC_ISO8859_4); + latin2mic(l, p, len, LC_ISO8859_4, PG_LATIN4); } + static void -mic2latin4(unsigned char *mic, unsigned char *p, int len) +mic2latin4(const unsigned char *mic, unsigned char *p, int len) { - mic2latin(mic, p, len, LC_ISO8859_4); + mic2latin(mic, p, len, LC_ISO8859_4, PG_LATIN4); } diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_ascii/utf8_and_ascii.c b/src/backend/utils/mb/conversion_procs/utf8_and_ascii/utf8_and_ascii.c index 33bd4bc2c85..ce9639d40d6 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_ascii/utf8_and_ascii.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_ascii/utf8_and_ascii.c @@ -6,7 +6,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_ascii/utf8_and_ascii.c,v 1.12 2006/03/05 15:58:47 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_ascii/utf8_and_ascii.c,v 1.13 2006/05/21 20:05:20 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -43,6 +43,7 @@ ascii_to_utf8(PG_FUNCTION_ARGS) Assert(PG_GETARG_INT32(1) == PG_UTF8); Assert(len >= 0); + /* this looks wrong, but basically we're just rejecting high-bit-set */ pg_ascii2mic(src, dest, len); PG_RETURN_VOID(); @@ -59,6 +60,7 @@ utf8_to_ascii(PG_FUNCTION_ARGS) Assert(PG_GETARG_INT32(1) == PG_SQL_ASCII); Assert(len >= 0); + /* this looks wrong, but basically we're just rejecting high-bit-set */ pg_mic2ascii(src, dest, len); PG_RETURN_VOID(); diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_big5/utf8_and_big5.c b/src/backend/utils/mb/conversion_procs/utf8_and_big5/utf8_and_big5.c index 3ec1497cf48..00fd62c619b 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_big5/utf8_and_big5.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_big5/utf8_and_big5.c @@ -6,7 +6,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_big5/utf8_and_big5.c,v 1.12 2006/03/05 15:58:47 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_big5/utf8_and_big5.c,v 1.13 2006/05/21 20:05:20 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -62,7 +62,7 @@ utf8_to_big5(PG_FUNCTION_ARGS) Assert(len >= 0); UtfToLocal(src, dest, ULmapBIG5, - sizeof(ULmapBIG5) / sizeof(pg_utf_to_local), len); + sizeof(ULmapBIG5) / sizeof(pg_utf_to_local), PG_BIG5, len); PG_RETURN_VOID(); } diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_cyrillic/utf8_and_cyrillic.c b/src/backend/utils/mb/conversion_procs/utf8_and_cyrillic/utf8_and_cyrillic.c index 3dd47960423..0854e0dff03 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_cyrillic/utf8_and_cyrillic.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_cyrillic/utf8_and_cyrillic.c @@ -6,7 +6,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_cyrillic/utf8_and_cyrillic.c,v 1.14 2006/03/05 15:58:47 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_cyrillic/utf8_and_cyrillic.c,v 1.15 2006/05/21 20:05:20 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -46,7 +46,7 @@ utf8_to_koi8r(PG_FUNCTION_ARGS) Assert(len >= 0); UtfToLocal(src, dest, ULmapKOI8R, - sizeof(ULmapKOI8R) / sizeof(pg_utf_to_local), len); + sizeof(ULmapKOI8R) / sizeof(pg_utf_to_local), PG_KOI8R, len); PG_RETURN_VOID(); } diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_euc_cn/utf8_and_euc_cn.c b/src/backend/utils/mb/conversion_procs/utf8_and_euc_cn/utf8_and_euc_cn.c index 860445f0a0b..23a1a5060ad 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_euc_cn/utf8_and_euc_cn.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_euc_cn/utf8_and_euc_cn.c @@ -6,7 +6,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_euc_cn/utf8_and_euc_cn.c,v 1.13 2006/03/05 15:58:47 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_euc_cn/utf8_and_euc_cn.c,v 1.14 2006/05/21 20:05:20 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -62,7 +62,7 @@ utf8_to_euc_cn(PG_FUNCTION_ARGS) Assert(len >= 0); UtfToLocal(src, dest, ULmapEUC_CN, - sizeof(ULmapEUC_CN) / sizeof(pg_utf_to_local), len); + sizeof(ULmapEUC_CN) / sizeof(pg_utf_to_local), PG_EUC_CN, len); PG_RETURN_VOID(); } diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_euc_jp/utf8_and_euc_jp.c b/src/backend/utils/mb/conversion_procs/utf8_and_euc_jp/utf8_and_euc_jp.c index 1662d79b0a7..11bcd7ebc2e 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_euc_jp/utf8_and_euc_jp.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_euc_jp/utf8_and_euc_jp.c @@ -6,7 +6,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_euc_jp/utf8_and_euc_jp.c,v 1.13 2006/03/05 15:58:48 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_euc_jp/utf8_and_euc_jp.c,v 1.14 2006/05/21 20:05:20 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -62,7 +62,7 @@ utf8_to_euc_jp(PG_FUNCTION_ARGS) Assert(len >= 0); UtfToLocal(src, dest, ULmapEUC_JP, - sizeof(ULmapEUC_JP) / sizeof(pg_utf_to_local), len); + sizeof(ULmapEUC_JP) / sizeof(pg_utf_to_local), PG_EUC_JP, len); PG_RETURN_VOID(); } diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_euc_kr/utf8_and_euc_kr.c b/src/backend/utils/mb/conversion_procs/utf8_and_euc_kr/utf8_and_euc_kr.c index 2059ad9d8b6..689584312c2 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_euc_kr/utf8_and_euc_kr.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_euc_kr/utf8_and_euc_kr.c @@ -6,7 +6,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_euc_kr/utf8_and_euc_kr.c,v 1.13 2006/03/05 15:58:48 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_euc_kr/utf8_and_euc_kr.c,v 1.14 2006/05/21 20:05:20 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -62,7 +62,7 @@ utf8_to_euc_kr(PG_FUNCTION_ARGS) Assert(len >= 0); UtfToLocal(src, dest, ULmapEUC_KR, - sizeof(ULmapEUC_KR) / sizeof(pg_utf_to_local), len); + sizeof(ULmapEUC_KR) / sizeof(pg_utf_to_local), PG_EUC_KR, len); PG_RETURN_VOID(); } diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_euc_tw/utf8_and_euc_tw.c b/src/backend/utils/mb/conversion_procs/utf8_and_euc_tw/utf8_and_euc_tw.c index f7141df8cef..a26139dd6af 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_euc_tw/utf8_and_euc_tw.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_euc_tw/utf8_and_euc_tw.c @@ -6,7 +6,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_euc_tw/utf8_and_euc_tw.c,v 1.13 2006/03/05 15:58:48 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_euc_tw/utf8_and_euc_tw.c,v 1.14 2006/05/21 20:05:20 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -62,7 +62,7 @@ utf8_to_euc_tw(PG_FUNCTION_ARGS) Assert(len >= 0); UtfToLocal(src, dest, ULmapEUC_TW, - sizeof(ULmapEUC_TW) / sizeof(pg_utf_to_local), len); + sizeof(ULmapEUC_TW) / sizeof(pg_utf_to_local), PG_EUC_TW, len); PG_RETURN_VOID(); } diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c b/src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c index 379dae606a6..f2587b08159 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c @@ -6,7 +6,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c,v 1.13 2006/03/05 15:58:48 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c,v 1.14 2006/05/21 20:05:20 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -62,7 +62,7 @@ utf8_to_gb18030(PG_FUNCTION_ARGS) Assert(len >= 0); UtfToLocal(src, dest, ULmapGB18030, - sizeof(ULmapGB18030) / sizeof(pg_utf_to_local), len); + sizeof(ULmapGB18030) / sizeof(pg_utf_to_local), PG_GB18030, len); PG_RETURN_VOID(); } diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_gbk/utf8_and_gbk.c b/src/backend/utils/mb/conversion_procs/utf8_and_gbk/utf8_and_gbk.c index f52004f185c..be7a283f916 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_gbk/utf8_and_gbk.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_gbk/utf8_and_gbk.c @@ -6,7 +6,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_gbk/utf8_and_gbk.c,v 1.12 2006/03/05 15:58:48 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_gbk/utf8_and_gbk.c,v 1.13 2006/05/21 20:05:20 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -62,7 +62,7 @@ utf8_to_gbk(PG_FUNCTION_ARGS) Assert(len >= 0); UtfToLocal(src, dest, ULmapGBK, - sizeof(ULmapGBK) / sizeof(pg_utf_to_local), len); + sizeof(ULmapGBK) / sizeof(pg_utf_to_local), PG_GBK, len); PG_RETURN_VOID(); } diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_iso8859/utf8_and_iso8859.c b/src/backend/utils/mb/conversion_procs/utf8_and_iso8859/utf8_and_iso8859.c index 8a86c3524e4..6de77c14aa1 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_iso8859/utf8_and_iso8859.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_iso8859/utf8_and_iso8859.c @@ -6,7 +6,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_iso8859/utf8_and_iso8859.c,v 1.18 2006/03/05 15:58:48 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_iso8859/utf8_and_iso8859.c,v 1.19 2006/05/21 20:05:20 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -153,7 +153,7 @@ utf8_to_iso8859(PG_FUNCTION_ARGS) { if (encoding == maps[i].encoding) { - UtfToLocal(src, dest, maps[i].map2, maps[i].size2, len); + UtfToLocal(src, dest, maps[i].map2, maps[i].size2, encoding, len); PG_RETURN_VOID(); } } diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_iso8859_1/utf8_and_iso8859_1.c b/src/backend/utils/mb/conversion_procs/utf8_and_iso8859_1/utf8_and_iso8859_1.c index 16861376d51..038f6781141 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_iso8859_1/utf8_and_iso8859_1.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_iso8859_1/utf8_and_iso8859_1.c @@ -6,7 +6,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_iso8859_1/utf8_and_iso8859_1.c,v 1.15 2006/03/05 15:58:48 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_iso8859_1/utf8_and_iso8859_1.c,v 1.16 2006/05/21 20:05:20 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -44,8 +44,11 @@ iso8859_1_to_utf8(PG_FUNCTION_ARGS) Assert(PG_GETARG_INT32(1) == PG_UTF8); Assert(len >= 0); - while (len-- > 0 && (c = *src++)) + while (len > 0) { + c = *src; + if (c == 0) + report_invalid_encoding(PG_LATIN1, (const char *) src, len); if (!IS_HIGHBIT_SET(c)) *dest++ = c; else @@ -53,6 +56,8 @@ iso8859_1_to_utf8(PG_FUNCTION_ARGS) *dest++ = (c >> 6) | 0xc0; *dest++ = (c & 0x003f) | HIGHBIT; } + src++; + len--; } *dest = '\0'; @@ -66,32 +71,44 @@ utf8_to_iso8859_1(PG_FUNCTION_ARGS) unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); unsigned short c, - c1, - c2; + c1; Assert(PG_GETARG_INT32(0) == PG_UTF8); Assert(PG_GETARG_INT32(1) == PG_LATIN1); Assert(len >= 0); - while (len >= 0 && (c = *src++)) + while (len > 0) { - if ((c & 0xe0) == 0xc0) + c = *src; + if (c == 0) + report_invalid_encoding(PG_UTF8, (const char *) src, len); + /* fast path for ASCII-subset characters */ + if (!IS_HIGHBIT_SET(c)) { - c1 = c & 0x1f; - c2 = *src++ & 0x3f; - *dest = c1 << 6; - *dest++ |= c2; - len -= 2; + *dest++ = c; + src++; + len--; } - else if ((c & 0xe0) == 0xe0) - ereport(WARNING, - (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER), - errmsg("ignoring unconvertible UTF-8 character 0x%04x", - c))); else { - *dest++ = c; - len--; + int l = pg_utf_mblen(src); + + if (l > len || !pg_utf8_islegal(src, l)) + report_invalid_encoding(PG_UTF8, (const char *) src, len); + if (l != 2) + report_untranslatable_char(PG_UTF8, PG_LATIN1, + (const char *) src, len); + c1 = src[1] & 0x3f; + c = ((c & 0x1f) << 6) | c1; + if (c >= 0x80 && c <= 0xff) + { + *dest++ = (unsigned char) c; + src += 2; + len -= 2; + } + else + report_untranslatable_char(PG_UTF8, PG_LATIN1, + (const char *) src, len); } } *dest = '\0'; diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_johab/utf8_and_johab.c b/src/backend/utils/mb/conversion_procs/utf8_and_johab/utf8_and_johab.c index 5896faf53e1..7223ecef8bd 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_johab/utf8_and_johab.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_johab/utf8_and_johab.c @@ -6,7 +6,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_johab/utf8_and_johab.c,v 1.13 2006/03/05 15:58:48 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_johab/utf8_and_johab.c,v 1.14 2006/05/21 20:05:21 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -62,7 +62,7 @@ utf8_to_johab(PG_FUNCTION_ARGS) Assert(len >= 0); UtfToLocal(src, dest, ULmapJOHAB, - sizeof(ULmapJOHAB) / sizeof(pg_utf_to_local), len); + sizeof(ULmapJOHAB) / sizeof(pg_utf_to_local), PG_JOHAB, len); PG_RETURN_VOID(); } diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_sjis/utf8_and_sjis.c b/src/backend/utils/mb/conversion_procs/utf8_and_sjis/utf8_and_sjis.c index decdc5fd215..12f9f43b5e7 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_sjis/utf8_and_sjis.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_sjis/utf8_and_sjis.c @@ -6,7 +6,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_sjis/utf8_and_sjis.c,v 1.12 2006/03/05 15:58:48 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_sjis/utf8_and_sjis.c,v 1.13 2006/05/21 20:05:21 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -62,7 +62,7 @@ utf8_to_sjis(PG_FUNCTION_ARGS) Assert(len >= 0); UtfToLocal(src, dest, ULmapSJIS, - sizeof(ULmapSJIS) / sizeof(pg_utf_to_local), len); + sizeof(ULmapSJIS) / sizeof(pg_utf_to_local), PG_SJIS, len); PG_RETURN_VOID(); } diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_uhc/utf8_and_uhc.c b/src/backend/utils/mb/conversion_procs/utf8_and_uhc/utf8_and_uhc.c index 5689dc54337..860b475b8a2 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_uhc/utf8_and_uhc.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_uhc/utf8_and_uhc.c @@ -6,7 +6,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_uhc/utf8_and_uhc.c,v 1.12 2006/03/05 15:58:48 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_uhc/utf8_and_uhc.c,v 1.13 2006/05/21 20:05:21 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -62,7 +62,7 @@ utf8_to_uhc(PG_FUNCTION_ARGS) Assert(len >= 0); UtfToLocal(src, dest, ULmapUHC, - sizeof(ULmapUHC) / sizeof(pg_utf_to_local), len); + sizeof(ULmapUHC) / sizeof(pg_utf_to_local), PG_UHC, len); PG_RETURN_VOID(); } diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_win/utf8_and_win.c b/src/backend/utils/mb/conversion_procs/utf8_and_win/utf8_and_win.c index ad7f31996df..932b164fac7 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_win/utf8_and_win.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_win/utf8_and_win.c @@ -6,7 +6,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_win/utf8_and_win.c,v 1.2 2006/03/05 15:58:48 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/conversion_procs/utf8_and_win/utf8_and_win.c,v 1.3 2006/05/21 20:05:21 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -143,7 +143,7 @@ utf8_to_win(PG_FUNCTION_ARGS) { if (encoding == maps[i].encoding) { - UtfToLocal(src, dest, maps[i].map2, maps[i].size2, len); + UtfToLocal(src, dest, maps[i].map2, maps[i].size2, encoding, len); PG_RETURN_VOID(); } } diff --git a/src/backend/utils/mb/mbutils.c b/src/backend/utils/mb/mbutils.c index dc43b049c6e..935e4a8d189 100644 --- a/src/backend/utils/mb/mbutils.c +++ b/src/backend/utils/mb/mbutils.c @@ -4,7 +4,7 @@ * (currently mule internal code (mic) is used) * Tatsuo Ishii * - * $PostgreSQL: pgsql/src/backend/utils/mb/mbutils.c,v 1.55 2006/01/12 22:04:02 neilc Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/mbutils.c,v 1.56 2006/05/21 20:05:19 tgl Exp $ */ #include "postgres.h" @@ -362,8 +362,49 @@ pg_client_to_server(const char *s, int len) Assert(DatabaseEncoding); Assert(ClientEncoding); - if (ClientEncoding->encoding == DatabaseEncoding->encoding) + if (len <= 0) + return (char *) s; + + if (ClientEncoding->encoding == DatabaseEncoding->encoding || + ClientEncoding->encoding == PG_SQL_ASCII) + { + /* + * No conversion is needed, but we must still validate the data. + */ + (void) pg_verify_mbstr(DatabaseEncoding->encoding, s, len, false); + return (char *) s; + } + + if (DatabaseEncoding->encoding == PG_SQL_ASCII) + { + /* + * No conversion is possible, but we must still validate the data, + * because the client-side code might have done string escaping + * using the selected client_encoding. If the client encoding is + * ASCII-safe then we just do a straight validation under that + * encoding. For an ASCII-unsafe encoding we have a problem: + * we dare not pass such data to the parser but we have no way + * to convert it. We compromise by rejecting the data if it + * contains any non-ASCII characters. + */ + if (PG_VALID_BE_ENCODING(ClientEncoding->encoding)) + (void) pg_verify_mbstr(ClientEncoding->encoding, s, len, false); + else + { + int i; + + for (i = 0; i < len; i++) + { + if (s[i] == '\0' || IS_HIGHBIT_SET(s[i])) + ereport(ERROR, + (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), + errmsg("invalid byte value for encoding \"%s\": 0x%02x", + pg_enc2name_tbl[PG_SQL_ASCII].name, + (unsigned char) s[i]))); + } + } return (char *) s; + } return perform_default_encoding_conversion(s, len, true); } @@ -377,9 +418,14 @@ pg_server_to_client(const char *s, int len) Assert(DatabaseEncoding); Assert(ClientEncoding); - if (ClientEncoding->encoding == DatabaseEncoding->encoding) + if (len <= 0) return (char *) s; + if (ClientEncoding->encoding == DatabaseEncoding->encoding || + ClientEncoding->encoding == PG_SQL_ASCII || + DatabaseEncoding->encoding == PG_SQL_ASCII) + return (char *) s; /* assume data is valid */ + return perform_default_encoding_conversion(s, len, false); } @@ -398,9 +444,6 @@ perform_default_encoding_conversion(const char *src, int len, bool is_client_to_ dest_encoding; FmgrInfo *flinfo; - if (len <= 0) - return (char *) src; - if (is_client_to_server) { src_encoding = ClientEncoding->encoding; @@ -417,12 +460,6 @@ perform_default_encoding_conversion(const char *src, int len, bool is_client_to_ if (flinfo == NULL) return (char *) src; - if (src_encoding == dest_encoding) - return (char *) src; - - if (src_encoding == PG_SQL_ASCII || dest_encoding == PG_SQL_ASCII) - return (char *) src; - result = palloc(len * 4 + 1); FunctionCall5(flinfo, diff --git a/src/backend/utils/mb/wchar.c b/src/backend/utils/mb/wchar.c index d996b6c826b..0cd1d313109 100644 --- a/src/backend/utils/mb/wchar.c +++ b/src/backend/utils/mb/wchar.c @@ -1,7 +1,7 @@ /* * conversion functions between pg_wchar and multibyte streams. * Tatsuo Ishii - * $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.54 2006/02/18 16:15:22 petere Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.55 2006/05/21 20:05:19 tgl Exp $ * * WIN1250 client encoding updated by Pavel Behal * @@ -19,17 +19,21 @@ /* * conversion to pg_wchar is done by "table driven." - * to add an encoding support, define mb2wchar_with_len(), mblen() + * to add an encoding support, define mb2wchar_with_len(), mblen(), dsplen() * for the particular encoding. Note that if the encoding is only * supported in the client, you don't need to define * mb2wchar_with_len() function (SJIS is the case). * + * These functions generally assume that their input is validly formed. + * The "verifier" functions, further down in the file, have to be more + * paranoid. We expect that mblen() does not need to examine more than + * the first byte of the character to discover the correct length. + * * Note: for the display output of psql to work properly, the return values - * of these functions must conform to the Unicode standard. In particular + * of the dsplen functions must conform to the Unicode standard. In particular * the NUL character is zero width and control characters are generally * width -1. It is recommended that non-ASCII encodings refer their ASCII - * subset to the ASCII routines to ensure consistancy. - * + * subset to the ASCII routines to ensure consistency. */ /* @@ -109,7 +113,7 @@ static int pg_euc2wchar_with_len return cnt; } -static int +static inline int pg_euc_mblen(const unsigned char *s) { int len; @@ -125,7 +129,7 @@ pg_euc_mblen(const unsigned char *s) return len; } -static int +static inline int pg_euc_dsplen(const unsigned char *s) { int len; @@ -316,7 +320,7 @@ pg_euctw_mblen(const unsigned char *s) else if (IS_HIGHBIT_SET(*s)) len = 2; else - len = pg_ascii_dsplen(s); + len = 1; return len; } @@ -409,7 +413,7 @@ pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len) } /* - * returns the byte length of a UTF8 word pointed to by s + * returns the byte length of a UTF8 character pointed to by s */ int pg_utf_mblen(const unsigned char *s) @@ -680,7 +684,20 @@ pg_mule_mblen(const unsigned char *s) static int pg_mule_dsplen(const unsigned char *s) { - return pg_ascii_dsplen(s); /* XXX fix me! */ + int len; + + if (IS_LC1(*s)) + len = 1; + else if (IS_LCPRV1(*s)) + len = 1; + else if (IS_LC2(*s)) + len = 2; + else if (IS_LCPRV2(*s)) + len = 2; + else + len = 1; /* assume ASCII */ + + return len; } /* @@ -860,233 +877,646 @@ pg_gb18030_dsplen(const unsigned char *s) return len; } +/* + *------------------------------------------------------------------- + * multibyte sequence validators + * + * These functions accept "s", a pointer to the first byte of a string, + * and "len", the remaining length of the string. If there is a validly + * encoded character beginning at *s, return its length in bytes; else + * return -1. + * + * The functions can assume that len > 0 and that *s != '\0', but they must + * test for and reject zeroes in any additional bytes of a multibyte character. + * + * Note that this definition allows the function for a single-byte + * encoding to be just "return 1". + *------------------------------------------------------------------- + */ -pg_wchar_tbl pg_wchar_table[] = { - {pg_ascii2wchar_with_len, pg_ascii_mblen, pg_ascii_dsplen, 1}, /* 0; PG_SQL_ASCII */ - {pg_eucjp2wchar_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, 3}, /* 1; PG_EUC_JP */ - {pg_euccn2wchar_with_len, pg_euccn_mblen, pg_euccn_dsplen, 3}, /* 2; PG_EUC_CN */ - {pg_euckr2wchar_with_len, pg_euckr_mblen, pg_euckr_dsplen, 3}, /* 3; PG_EUC_KR */ - {pg_euctw2wchar_with_len, pg_euctw_mblen, pg_euctw_dsplen, 3}, /* 4; PG_EUC_TW */ - {pg_johab2wchar_with_len, pg_johab_mblen, pg_johab_dsplen, 3}, /* 5; PG_JOHAB */ - {pg_utf2wchar_with_len, pg_utf_mblen, pg_utf_dsplen, 4}, /* 6; PG_UTF8 */ - {pg_mule2wchar_with_len, pg_mule_mblen, pg_mule_dsplen, 3}, /* 7; PG_MULE_INTERNAL */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 8; PG_LATIN1 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 9; PG_LATIN2 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 10; PG_LATIN3 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 11; PG_LATIN4 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 12; PG_LATIN5 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 13; PG_LATIN6 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 14; PG_LATIN7 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 15; PG_LATIN8 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 16; PG_LATIN9 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 17; PG_LATIN10 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 18; PG_WIN1256 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 19; PG_WIN1258 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 20; PG_WIN874 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 21; PG_KOI8 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 22; PG_WIN1251 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 22; PG_WIN1252 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 23; PG_WIN866 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 24; ISO-8859-5 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 25; ISO-8859-6 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 26; ISO-8859-7 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 27; ISO-8859-8 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 28; PG_WIN1250 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 29; PG_WIN1253 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 30; PG_WIN1254 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 31; PG_WIN1255 */ - {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 32; PG_WIN1257 */ - {0, pg_sjis_mblen, pg_sjis_dsplen, 2}, /* 33; PG_SJIS */ - {0, pg_big5_mblen, pg_big5_dsplen, 2}, /* 34; PG_BIG5 */ - {0, pg_gbk_mblen, pg_gbk_dsplen, 2}, /* 35; PG_GBK */ - {0, pg_uhc_mblen, pg_uhc_dsplen, 2}, /* 36; PG_UHC */ - {0, pg_gb18030_mblen, pg_gb18030_dsplen, 2} /* 37; PG_GB18030 */ -}; +static int +pg_ascii_verifier(const unsigned char *s, int len) +{ + return 1; +} -/* returns the byte length of a word for mule internal code */ -int -pg_mic_mblen(const unsigned char *mbstr) +#define IS_EUC_RANGE_VALID(c) ((c) >= 0xa1 && (c) <= 0xfe) + +static int +pg_eucjp_verifier(const unsigned char *s, int len) { - return pg_mule_mblen(mbstr); + int l; + unsigned char c1, c2; + + c1 = *s++; + + switch (c1) + { + case SS2: /* JIS X 0201 */ + l = 2; + if (l > len) + return -1; + c2 = *s++; + if (c2 < 0xa1 || c2 > 0xdf) + return -1; + break; + + case SS3: /* JIS X 0212 */ + l = 3; + if (l > len) + return -1; + c2 = *s++; + if (!IS_EUC_RANGE_VALID(c2)) + return -1; + c2 = *s++; + if (!IS_EUC_RANGE_VALID(c2)) + return -1; + break; + + default: + if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */ + { + l = 2; + if (l > len) + return -1; + if (!IS_EUC_RANGE_VALID(c1)) + return -1; + c2 = *s++; + if (!IS_EUC_RANGE_VALID(c2)) + return -1; + } + else /* must be ASCII */ + { + l = 1; + } + break; + } + + return l; } -/* - * Returns the byte length of a multibyte word. - */ -int -pg_encoding_mblen(int encoding, const char *mbstr) +static int +pg_euckr_verifier(const unsigned char *s, int len) { - Assert(PG_VALID_ENCODING(encoding)); + int l; + unsigned char c1, c2; - return ((encoding >= 0 && - encoding < sizeof(pg_wchar_table) / sizeof(pg_wchar_tbl)) ? - ((*pg_wchar_table[encoding].mblen) ((const unsigned char *) mbstr)) : - ((*pg_wchar_table[PG_SQL_ASCII].mblen) ((const unsigned char *) mbstr))); + c1 = *s++; + + if (IS_HIGHBIT_SET(c1)) + { + l = 2; + if (l > len) + return -1; + if (!IS_EUC_RANGE_VALID(c1)) + return -1; + c2 = *s++; + if (!IS_EUC_RANGE_VALID(c2)) + return -1; + } + else /* must be ASCII */ + { + l = 1; + } + + return l; } -/* - * Returns the display length of a multibyte word. - */ -int -pg_encoding_dsplen(int encoding, const char *mbstr) +/* EUC-CN byte sequences are exactly same as EUC-KR */ +#define pg_euccn_verifier pg_euckr_verifier + +static int +pg_euctw_verifier(const unsigned char *s, int len) { - Assert(PG_VALID_ENCODING(encoding)); + int l; + unsigned char c1, c2; - return ((encoding >= 0 && - encoding < sizeof(pg_wchar_table) / sizeof(pg_wchar_tbl)) ? - ((*pg_wchar_table[encoding].dsplen) ((const unsigned char *) mbstr)) : - ((*pg_wchar_table[PG_SQL_ASCII].dsplen) ((const unsigned char *) mbstr))); + c1 = *s++; + + switch (c1) + { + case SS2: /* CNS 11643 Plane 1-7 */ + l = 4; + if (l > len) + return -1; + c2 = *s++; + if (c2 < 0xa1 || c2 > 0xa7) + return -1; + c2 = *s++; + if (!IS_EUC_RANGE_VALID(c2)) + return -1; + c2 = *s++; + if (!IS_EUC_RANGE_VALID(c2)) + return -1; + break; + + case SS3: /* unused */ + return -1; + + default: + if (IS_HIGHBIT_SET(c1)) /* CNS 11643 Plane 1 */ + { + l = 2; + if (l > len) + return -1; + /* no further range check on c1? */ + c2 = *s++; + if (!IS_EUC_RANGE_VALID(c2)) + return -1; + } + else /* must be ASCII */ + { + l = 1; + } + break; + } + return l; } -/* - * fetch maximum length of a char encoding - */ -int -pg_encoding_max_length(int encoding) +static int +pg_johab_verifier(const unsigned char *s, int len) { - Assert(PG_VALID_ENCODING(encoding)); + int l, mbl; + unsigned char c; - return pg_wchar_table[encoding].maxmblen; + l = mbl = pg_johab_mblen(s); + + if (len < l) + return -1; + + if (!IS_HIGHBIT_SET(*s)) + return mbl; + + while (--l > 0) + { + c = *++s; + if (!IS_EUC_RANGE_VALID(c)) + return -1; + } + return mbl; } -#ifndef FRONTEND +static int +pg_mule_verifier(const unsigned char *s, int len) +{ + int l, mbl; + unsigned char c; + + l = mbl = pg_mule_mblen(s); + + if (len < l) + return -1; + + while (--l > 0) + { + c = *++s; + if (!IS_HIGHBIT_SET(c)) + return -1; + } + return mbl; +} + +static int +pg_latin1_verifier(const unsigned char *s, int len) +{ + return 1; +} + +static int +pg_sjis_verifier(const unsigned char *s, int len) +{ + int l, mbl; + unsigned char c1, c2; + + l = mbl = pg_sjis_mblen(s); + + if (len < l) + return -1; + + if (l == 1) /* pg_sjis_mblen already verified it */ + return mbl; + + c1 = *s++; + c2 = *s; + if (!ISSJISHEAD(c1) || !ISSJISTAIL(c2)) + return -1; + return mbl; +} + +static int +pg_big5_verifier(const unsigned char *s, int len) +{ + int l, mbl; + + l = mbl = pg_big5_mblen(s); + + if (len < l) + return -1; + + while (--l > 0) + { + if (*++s == '\0') + return -1; + } + + return mbl; +} + +static int +pg_gbk_verifier(const unsigned char *s, int len) +{ + int l, mbl; + + l = mbl = pg_gbk_mblen(s); + + if (len < l) + return -1; + + while (--l > 0) + { + if (*++s == '\0') + return -1; + } + + return mbl; +} +static int +pg_uhc_verifier(const unsigned char *s, int len) +{ + int l, mbl; + + l = mbl = pg_uhc_mblen(s); + + if (len < l) + return -1; + + while (--l > 0) + { + if (*++s == '\0') + return -1; + } + + return mbl; +} + +static int +pg_gb18030_verifier(const unsigned char *s, int len) +{ + int l, mbl; + + l = mbl = pg_gb18030_mblen(s); + + if (len < l) + return -1; + + while (--l > 0) + { + if (*++s == '\0') + return -1; + } + + return mbl; +} + +static int +pg_utf8_verifier(const unsigned char *s, int len) +{ + int l = pg_utf_mblen(s); + + if (len < l) + return -1; + + if (!pg_utf8_islegal(s, l)) + return -1; + + return l; +} + +/* + * Check for validity of a single UTF-8 encoded character + * + * This directly implements the rules in RFC3629. The bizarre-looking + * restrictions on the second byte are meant to ensure that there isn't + * more than one encoding of a given Unicode character point; that is, + * you may not use a longer-than-necessary byte sequence with high order + * zero bits to represent a character that would fit in fewer bytes. + * To do otherwise is to create security hazards (eg, create an apparent + * non-ASCII character that decodes to plain ASCII). + * + * length is assumed to have been obtained by pg_utf_mblen(), and the + * caller must have checked that that many bytes are present in the buffer. + */ bool pg_utf8_islegal(const unsigned char *source, int length) { unsigned char a; - const unsigned char *srcptr = source + length; switch (length) { default: + /* reject lengths 5 and 6 for now */ return false; - /* Everything else falls through when "true"... */ case 4: - if ((a = (*--srcptr)) < 0x80 || a > 0xBF) + a = source[3]; + if (a < 0x80 || a > 0xBF) return false; + /* FALL THRU */ case 3: - if ((a = (*--srcptr)) < 0x80 || a > 0xBF) + a = source[2]; + if (a < 0x80 || a > 0xBF) return false; + /* FALL THRU */ case 2: - if ((a = (*--srcptr)) > 0xBF) - return false; + a = source[1]; switch (*source) { - /* no fall-through in this inner switch */ case 0xE0: - if (a < 0xA0) + if (a < 0xA0 || a > 0xBF) return false; break; case 0xED: - if (a > 0x9F) + if (a < 0x80 || a > 0x9F) return false; break; case 0xF0: - if (a < 0x90) + if (a < 0x90 || a > 0xBF) return false; break; case 0xF4: - if (a > 0x8F) + if (a < 0x80 || a > 0x8F) return false; break; default: - if (a < 0x80) + if (a < 0x80 || a > 0xBF) return false; + break; } - + /* FALL THRU */ case 1: - if (*source >= 0x80 && *source < 0xC2) + a = *source; + if (a >= 0x80 && a < 0xC2) + return false; + if (a > 0xF4) return false; + break; } - if (*source > 0xF4) - return false; return true; } +/* + *------------------------------------------------------------------- + * encoding info table + *------------------------------------------------------------------- + */ +pg_wchar_tbl pg_wchar_table[] = { + {pg_ascii2wchar_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifier, 1}, /* 0; PG_SQL_ASCII */ + {pg_eucjp2wchar_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifier, 3}, /* 1; PG_EUC_JP */ + {pg_euccn2wchar_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifier, 3}, /* 2; PG_EUC_CN */ + {pg_euckr2wchar_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifier, 3}, /* 3; PG_EUC_KR */ + {pg_euctw2wchar_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifier, 3}, /* 4; PG_EUC_TW */ + {pg_johab2wchar_with_len, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifier, 3}, /* 5; PG_JOHAB */ + {pg_utf2wchar_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_verifier, 4}, /* 6; PG_UTF8 */ + {pg_mule2wchar_with_len, pg_mule_mblen, pg_mule_dsplen, pg_mule_verifier, 3}, /* 7; PG_MULE_INTERNAL */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 8; PG_LATIN1 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 9; PG_LATIN2 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 10; PG_LATIN3 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 11; PG_LATIN4 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 12; PG_LATIN5 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 13; PG_LATIN6 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 14; PG_LATIN7 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 15; PG_LATIN8 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 16; PG_LATIN9 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 17; PG_LATIN10 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 18; PG_WIN1256 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 19; PG_WIN1258 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 20; PG_WIN874 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 21; PG_KOI8 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 22; PG_WIN1251 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 22; PG_WIN1252 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 23; PG_WIN866 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 24; ISO-8859-5 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 25; ISO-8859-6 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 26; ISO-8859-7 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 27; ISO-8859-8 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 28; PG_WIN1250 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 29; PG_WIN1253 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 30; PG_WIN1254 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 31; PG_WIN1255 */ + {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 32; PG_WIN1257 */ + {0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2}, /* 33; PG_SJIS */ + {0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifier, 2}, /* 34; PG_BIG5 */ + {0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifier, 2}, /* 35; PG_GBK */ + {0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifier, 2}, /* 36; PG_UHC */ + {0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifier, 2} /* 37; PG_GB18030 */ +}; + +/* returns the byte length of a word for mule internal code */ +int +pg_mic_mblen(const unsigned char *mbstr) +{ + return pg_mule_mblen(mbstr); +} + +/* + * Returns the byte length of a multibyte character. + */ +int +pg_encoding_mblen(int encoding, const char *mbstr) +{ + Assert(PG_VALID_ENCODING(encoding)); + + return ((encoding >= 0 && + encoding < sizeof(pg_wchar_table) / sizeof(pg_wchar_tbl)) ? + ((*pg_wchar_table[encoding].mblen) ((const unsigned char *) mbstr)) : + ((*pg_wchar_table[PG_SQL_ASCII].mblen) ((const unsigned char *) mbstr))); +} + +/* + * Returns the display length of a multibyte character. + */ +int +pg_encoding_dsplen(int encoding, const char *mbstr) +{ + Assert(PG_VALID_ENCODING(encoding)); + + return ((encoding >= 0 && + encoding < sizeof(pg_wchar_table) / sizeof(pg_wchar_tbl)) ? + ((*pg_wchar_table[encoding].dsplen) ((const unsigned char *) mbstr)) : + ((*pg_wchar_table[PG_SQL_ASCII].dsplen) ((const unsigned char *) mbstr))); +} + +/* + * Verify the first multibyte character of the given string. + * Return its byte length if good, -1 if bad. (See comments above for + * full details of the mbverify API.) + */ +int +pg_encoding_verifymb(int encoding, const char *mbstr, int len) +{ + Assert(PG_VALID_ENCODING(encoding)); + + return ((encoding >= 0 && + encoding < sizeof(pg_wchar_table) / sizeof(pg_wchar_tbl)) ? + ((*pg_wchar_table[encoding].mbverify) ((const unsigned char *) mbstr, len)) : + ((*pg_wchar_table[PG_SQL_ASCII].mbverify) ((const unsigned char *) mbstr, len))); +} /* - * Verify mbstr to make sure that it has a valid character sequence. - * mbstr is not necessarily NULL terminated; length of mbstr is + * fetch maximum length of a given encoding + */ +int +pg_encoding_max_length(int encoding) +{ + Assert(PG_VALID_ENCODING(encoding)); + + return pg_wchar_table[encoding].maxmblen; +} + +#ifndef FRONTEND + +/* + * fetch maximum length of the encoding for the current database + */ +int +pg_database_encoding_max_length(void) +{ + return pg_wchar_table[GetDatabaseEncoding()].maxmblen; +} + +/* + * Verify mbstr to make sure that it is validly encoded in the current + * database encoding. Otherwise same as pg_verify_mbstr(). + */ +bool +pg_verifymbstr(const char *mbstr, int len, bool noError) +{ + return pg_verify_mbstr(GetDatabaseEncoding(), mbstr, len, noError); +} + +/* + * Verify mbstr to make sure that it is validly encoded in the specified + * encoding. + * + * mbstr is not necessarily zero terminated; length of mbstr is * specified by len. * * If OK, return TRUE. If a problem is found, return FALSE when noError is * true; when noError is false, ereport() a descriptive message. */ bool -pg_verifymbstr(const char *mbstr, int len, bool noError) +pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError) { - int l; - int i; - int encoding; + mbverifier mbverify; + + Assert(PG_VALID_ENCODING(encoding)); + + /* + * In single-byte encodings, we need only reject nulls (\0). + */ + if (pg_encoding_max_length(encoding) <= 1) + { + const char *nullpos = memchr(mbstr, 0, len); - /* we do not need any check in single-byte encodings */ - if (pg_database_encoding_max_length() <= 1) - return true; + if (nullpos == NULL) + return true; + if (noError) + return false; + report_invalid_encoding(encoding, nullpos, 1); + } - encoding = GetDatabaseEncoding(); + /* fetch function pointer just once */ + mbverify = pg_wchar_table[encoding].mbverify; - while (len > 0 && *mbstr) + while (len > 0) { - l = pg_mblen(mbstr); + int l; - /* special UTF-8 check */ - if (encoding == PG_UTF8) + /* fast path for ASCII-subset characters */ + if (!IS_HIGHBIT_SET(*mbstr)) { - if (!pg_utf8_islegal((const unsigned char *) mbstr, l)) + if (*mbstr != '\0') { - if (noError) - return false; - ereport(ERROR, - (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), - errmsg("invalid UTF-8 byte sequence detected near byte 0x%02x", - (unsigned char) *mbstr))); + mbstr++; + len--; + continue; } + if (noError) + return false; + report_invalid_encoding(encoding, mbstr, len); } - else - { - for (i = 1; i < l; i++) - { - /* - * we expect that every multibyte char consists of bytes - * having the 8th bit set - */ - if (i >= len || !IS_HIGHBIT_SET(mbstr[i])) - { - char buf[8 * 2 + 1]; - char *p = buf; - int j, - jlimit; - - if (noError) - return false; - - jlimit = Min(l, len); - jlimit = Min(jlimit, 8); /* prevent buffer overrun */ - for (j = 0; j < jlimit; j++) - p += sprintf(p, "%02x", (unsigned char) mbstr[j]); + l = (*mbverify) ((const unsigned char *) mbstr, len); - ereport(ERROR, - (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), - errmsg("invalid byte sequence for encoding \"%s\": 0x%s", - GetDatabaseEncodingName(), buf))); - } - } + if (l < 0) + { + if (noError) + return false; + report_invalid_encoding(encoding, mbstr, len); } - len -= l; + mbstr += l; + len -= l; } return true; } /* - * fetch maximum length of a char encoding for the current database + * report_invalid_encoding: complain about invalid multibyte character + * + * note: len is remaining length of string, not length of character; + * len must be greater than zero, as we always examine the first byte. + */ +void +report_invalid_encoding(int encoding, const char *mbstr, int len) +{ + int l = pg_encoding_mblen(encoding, mbstr); + char buf[8 * 2 + 1]; + char *p = buf; + int j, + jlimit; + + jlimit = Min(l, len); + jlimit = Min(jlimit, 8); /* prevent buffer overrun */ + + for (j = 0; j < jlimit; j++) + p += sprintf(p, "%02x", (unsigned char) mbstr[j]); + + ereport(ERROR, + (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), + errmsg("invalid byte sequence for encoding \"%s\": 0x%s", + pg_enc2name_tbl[encoding].name, + buf))); +} + +/* + * report_untranslatable_char: complain about untranslatable character + * + * note: len is remaining length of string, not length of character; + * len must be greater than zero, as we always examine the first byte. */ -int -pg_database_encoding_max_length(void) +void +report_untranslatable_char(int src_encoding, int dest_encoding, + const char *mbstr, int len) { - return pg_wchar_table[GetDatabaseEncoding()].maxmblen; + int l = pg_encoding_mblen(src_encoding, mbstr); + char buf[8 * 2 + 1]; + char *p = buf; + int j, + jlimit; + + jlimit = Min(l, len); + jlimit = Min(jlimit, 8); /* prevent buffer overrun */ + + for (j = 0; j < jlimit; j++) + p += sprintf(p, "%02x", (unsigned char) mbstr[j]); + + ereport(ERROR, + (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER), + errmsg("character 0x%s of encoding \"%s\" has no equivalent in \"%s\"", + buf, + pg_enc2name_tbl[src_encoding].name, + pg_enc2name_tbl[dest_encoding].name))); } #endif diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h index d049f4ecfc3..1bb8042344d 100644 --- a/src/include/mb/pg_wchar.h +++ b/src/include/mb/pg_wchar.h @@ -1,4 +1,4 @@ -/* $PostgreSQL: pgsql/src/include/mb/pg_wchar.h,v 1.67 2006/02/18 16:15:23 petere Exp $ */ +/* $PostgreSQL: pgsql/src/include/mb/pg_wchar.h,v 1.68 2006/05/21 20:05:21 tgl Exp $ */ #ifndef PG_WCHAR_H #define PG_WCHAR_H @@ -24,10 +24,16 @@ typedef unsigned int pg_wchar; #define SS3 0x8f /* single shift 3 (JIS0212) */ /* + * SJIS validation macros + */ +#define ISSJISHEAD(c) (((c) >= 0x81 && (c) <= 0x9f) || ((c) >= 0xe0 && (c) <= 0xfc)) +#define ISSJISTAIL(c) (((c) >= 0x40 && (c) <= 0x7e) || ((c) >= 0x80 && (c) <= 0xfc)) + +/* * Leading byte types or leading prefix byte for MULE internal code. * See http://www.xemacs.org for more details. (there is a doc titled * "XEmacs Internals Manual", "MULE Character Sets and Encodings" - * section. + * section.) */ /* * Is a leading byte for "official" single byte encodings? @@ -64,7 +70,7 @@ typedef unsigned int pg_wchar; #define LC_ISO8859_8 0x88 /* Hebrew (not supported yet) */ #define LC_JISX0201K 0x89 /* Japanese 1 byte kana */ #define LC_JISX0201R 0x8a /* Japanese 1 byte Roman */ -/* Note that 0x8b seems to be unused in as of Emacs 20.7. +/* Note that 0x8b seems to be unused as of Emacs 20.7. * However, there might be a chance that 0x8b could be used * in later version of Emacs. */ @@ -135,13 +141,13 @@ typedef unsigned int pg_wchar; /* #define FREE 0xff free (unused) */ /* - * Encoding numeral identificators + * PostgreSQL encoding identifiers * * WARNING: the order of this table must be same as order * in the pg_enc2name[] (mb/encnames.c) array! * - * If you add some encoding don'y forget check - * PG_ENCODING_[BE|FE]_LAST macros. + * If you add some encoding don't forget to check + * PG_ENCODING_BE_LAST macro. * * The PG_SQL_ASCII is default encoding and must be = 0. */ @@ -208,8 +214,7 @@ typedef enum pg_enc #define PG_VALID_ENCODING(_enc) \ ((_enc) >= 0 && (_enc) < _PG_LAST_ENCODING_) -/* On FE are possible all encodings - */ +/* On FE are possible all encodings */ #define PG_VALID_FE_ENCODING(_enc) PG_VALID_ENCODING(_enc) /* @@ -249,18 +254,21 @@ extern const char *pg_encoding_to_char(int encoding); typedef int (*mb2wchar_with_len_converter) (const unsigned char *from, pg_wchar *to, int len); + typedef int (*mblen_converter) (const unsigned char *mbstr); typedef int (*mbdisplaylen_converter) (const unsigned char *mbstr); +typedef int (*mbverifier) (const unsigned char *mbstr, int len); + typedef struct { mb2wchar_with_len_converter mb2wchar_with_len; /* convert a multibyte * string to a wchar */ - mblen_converter mblen; /* returns the length of a multibyte char */ - mbdisplaylen_converter dsplen; /* returns the lenghth of a display - * length */ - int maxmblen; /* max bytes for a char in this charset */ + mblen_converter mblen; /* get byte length of a char */ + mbdisplaylen_converter dsplen; /* get display width of a char */ + mbverifier mbverify; /* verify multibyte sequence */ + int maxmblen; /* max bytes for a char in this encoding */ } pg_wchar_tbl; extern pg_wchar_tbl pg_wchar_table[]; @@ -293,6 +301,7 @@ extern int pg_mblen(const char *mbstr); extern int pg_dsplen(const char *mbstr); extern int pg_encoding_mblen(int encoding, const char *mbstr); extern int pg_encoding_dsplen(int encoding, const char *mbstr); +extern int pg_encoding_verifymb(int encoding, const char *mbstr, int len); extern int pg_mule_mblen(const unsigned char *mbstr); extern int pg_mic_mblen(const unsigned char *mbstr); extern int pg_mbstrlen(const char *mbstr); @@ -326,21 +335,32 @@ extern char *pg_server_to_client(const char *s, int len); extern unsigned short BIG5toCNS(unsigned short big5, unsigned char *lc); extern unsigned short CNStoBIG5(unsigned short cns, unsigned char lc); -extern void LocalToUtf(unsigned char *iso, unsigned char *utf, - pg_local_to_utf *map, int size, int encoding, int len); +extern void LocalToUtf(const unsigned char *iso, unsigned char *utf, + const pg_local_to_utf *map, int size, int encoding, int len); -extern void UtfToLocal(unsigned char *utf, unsigned char *iso, - pg_utf_to_local *map, int size, int len); +extern void UtfToLocal(const unsigned char *utf, unsigned char *iso, + const pg_utf_to_local *map, int size, int encoding, int len); extern bool pg_verifymbstr(const char *mbstr, int len, bool noError); - -extern void pg_ascii2mic(unsigned char *src, unsigned char *dest, int len); -extern void pg_mic2ascii(unsigned char *src, unsigned char *dest, int len); -extern void pg_print_bogus_char(unsigned char **mic, unsigned char **p); -extern void latin2mic(unsigned char *l, unsigned char *p, int len, int lc); -extern void mic2latin(unsigned char *mic, unsigned char *p, int len, int lc); -extern void latin2mic_with_table(unsigned char *l, unsigned char *p, int len, int lc, unsigned char *tab); -extern void mic2latin_with_table(unsigned char *mic, unsigned char *p, int len, int lc, unsigned char *tab); +extern bool pg_verify_mbstr(int encoding, const char *mbstr, int len, + bool noError); + +extern void report_invalid_encoding(int encoding, const char *mbstr, int len); +extern void report_untranslatable_char(int src_encoding, int dest_encoding, + const char *mbstr, int len); + +extern void pg_ascii2mic(const unsigned char *l, unsigned char *p, int len); +extern void pg_mic2ascii(const unsigned char *mic, unsigned char *p, int len); +extern void latin2mic(const unsigned char *l, unsigned char *p, int len, + int lc, int encoding); +extern void mic2latin(const unsigned char *mic, unsigned char *p, int len, + int lc, int encoding); +extern void latin2mic_with_table(const unsigned char *l, unsigned char *p, + int len, int lc, int encoding, + const unsigned char *tab); +extern void mic2latin_with_table(const unsigned char *mic, unsigned char *p, + int len, int lc, int encoding, + const unsigned char *tab); extern bool pg_utf8_islegal(const unsigned char *source, int length); diff --git a/src/test/mb/expected/mule_internal.out b/src/test/mb/expected/mule_internal.out index fa1f836febe..ac8b57dc421 100644 --- a/src/test/mb/expected/mule_internal.out +++ b/src/test/mb/expected/mule_internal.out @@ -8,81 +8,81 @@ insert into גђ values('Ԓ咡钥ՒÒ',' insert into גђ values('Ԓ咡ג풥钥ޒ','Z01'); vacuum גђ; select * from גђ; - ђ | ʬ | 1a ------------------------------------------+-----------------+---------------- - Ԓ咡ǒג쒥 | A01 | - Ԓ咡钥ՒÒ | ʬB10 | - Ԓ咡ג풥钥ޒ | Z01 | + ђ | ʬ | 1a +----------------------------+------------+------------ + Ԓ咡ǒג쒥 | A01 | + Ԓ咡钥ՒÒ | ʬB10 | + Ԓ咡ג풥钥ޒ | Z01 | (3 rows) select * from גђ where ʬ = 'Z01'; - ђ | ʬ | 1a ---------------------------------------+-----------------+---------------- - Ԓ咡ג풥钥ޒ | Z01 | + ђ | ʬ | 1a +--------------------------+------------+------------ + Ԓ咡ג풥钥ޒ | Z01 | (1 row) select * from גђ where ʬ ~* 'z01'; - ђ | ʬ | 1a ---------------------------------------+-----------------+---------------- - Ԓ咡ג풥钥ޒ | Z01 | + ђ | ʬ | 1a +--------------------------+------------+------------ + Ԓ咡ג풥钥ޒ | Z01 | (1 row) select * from גђ where ʬ like '_Z01_'; - ђ | ʬ | 1a ---------------------------------------+-----------------+---------------- - Ԓ咡ג풥钥ޒ | Z01 | + ђ | ʬ | 1a +--------------------------+------------+------------ + Ԓ咡ג풥钥ޒ | Z01 | (1 row) select * from גђ where ʬ like '_Z%'; - ђ | ʬ | 1a ---------------------------------------+-----------------+---------------- - Ԓ咡ג풥钥ޒ | Z01 | + ђ | ʬ | 1a +--------------------------+------------+------------ + Ԓ咡ג풥钥ޒ | Z01 | (1 row) select * from גђ where ђ ~ 'Ԓ咡[ǒ]'; - ђ | ʬ | 1a ------------------------------------------+-----------------+---------------- - Ԓ咡ǒג쒥 | A01 | - Ԓ咡钥ՒÒ | ʬB10 | + ђ | ʬ | 1a +----------------------------+------------+------------ + Ԓ咡ǒג쒥 | A01 | + Ԓ咡钥ՒÒ | ʬB10 | (2 rows) select * from גђ where ђ ~* 'Ԓ咡[ǒ]'; - ђ | ʬ | 1a ------------------------------------------+-----------------+---------------- - Ԓ咡ǒג쒥 | A01 | - Ԓ咡钥ՒÒ | ʬB10 | + ђ | ʬ | 1a +----------------------------+------------+------------ + Ԓ咡ǒג쒥 | A01 | + Ԓ咡钥ՒÒ | ʬB10 | (2 rows) select *,character_length(ђ) from גђ; - ђ | ʬ | 1a | character_length ------------------------------------------+-----------------+----------------+------------------ - Ԓ咡ǒג쒥 | A01 | | 12 - Ԓ咡钥ՒÒ | ʬB10 | | 13 - Ԓ咡ג풥钥ޒ | Z01 | | 12 + ђ | ʬ | 1a | character_length +----------------------------+------------+------------+------------------ + Ԓ咡ǒג쒥 | A01 | | 12 + Ԓ咡钥ՒÒ | ʬB10 | | 13 + Ԓ咡ג풥钥ޒ | Z01 | | 12 (3 rows) select *,octet_length(ђ) from גђ; - ђ | ʬ | 1a | octet_length ------------------------------------------+-----------------+----------------+-------------- - Ԓ咡ǒג쒥 | A01 | | 36 - Ԓ咡钥ՒÒ | ʬB10 | | 39 - Ԓ咡ג풥钥ޒ | Z01 | | 36 + ђ | ʬ | 1a | octet_length +----------------------------+------------+------------+-------------- + Ԓ咡ǒג쒥 | A01 | | 36 + Ԓ咡钥ՒÒ | ʬB10 | | 39 + Ԓ咡ג풥钥ޒ | Z01 | | 36 (3 rows) select *,position('' in ђ) from גђ; - ђ | ʬ | 1a | position ------------------------------------------+-----------------+----------------+---------- - Ԓ咡ǒג쒥 | A01 | | 7 - Ԓ咡钥ՒÒ | ʬB10 | | 0 - Ԓ咡ג풥钥ޒ | Z01 | | 0 + ђ | ʬ | 1a | position +----------------------------+------------+------------+---------- + Ԓ咡ǒג쒥 | A01 | | 7 + Ԓ咡钥ՒÒ | ʬB10 | | 0 + Ԓ咡ג풥钥ޒ | Z01 | | 0 (3 rows) select *,substring(ђ from 10 for 4) from גђ; - ђ | ʬ | 1a | substring ------------------------------------------+-----------------+----------------+-------------- - Ԓ咡ǒג쒥 | A01 | | ג쒥 - Ԓ咡钥ՒÒ | ʬB10 | | Ò - Ԓ咡ג풥钥ޒ | Z01 | | 钥ޒ + ђ | ʬ | 1a | substring +----------------------------+------------+------------+----------- + Ԓ咡ǒג쒥 | A01 | | ג쒥 + Ԓ咡钥ՒÒ | ʬB10 | | Ò + Ԓ咡ג풥钥ޒ | Z01 | | 钥ޒ (3 rows) drop table Ƒ㑻; @@ -95,81 +95,81 @@ insert into Ƒ㑻 values('ԑͼ','B01'); insert into Ƒ㑻 values('ԑ̑Ա','Z01'); vacuum Ƒ㑻; select * from Ƒ㑻; - | ֑ | ע1a ------------------+-----------+---------- + | ֑ | ע1a +------------+---------+-------- ԑԑʾ | A01 | - ԑͼ | B01 | + ԑͼ | B01 | ԑ̑Ա | Z01 | (3 rows) select * from Ƒ㑻 where ֑ = 'Z01'; - | ֑ | ע1a ------------------+-----------+---------- + | ֑ | ע1a +------------+---------+-------- ԑ̑Ա | Z01 | (1 row) select * from Ƒ㑻 where ֑ ~* 'z01'; - | ֑ | ע1a ------------------+-----------+---------- + | ֑ | ע1a +------------+---------+-------- ԑ̑Ա | Z01 | (1 row) select * from Ƒ㑻 where ֑ like '_Z01_'; - | ֑ | ע1a ------------------+-----------+---------- + | ֑ | ע1a +------------+---------+-------- ԑ̑Ա | Z01 | (1 row) select * from Ƒ㑻 where ֑ like '_Z%'; - | ֑ | ע1a ------------------+-----------+---------- + | ֑ | ע1a +------------+---------+-------- ԑ̑Ա | Z01 | (1 row) select * from Ƒ㑻 where ~ '[ԑͼ]'; - | ֑ | ע1a ------------------+-----------+---------- + | ֑ | ע1a +------------+---------+-------- ԑԑʾ | A01 | - ԑͼ | B01 | + ԑͼ | B01 | (2 rows) select * from Ƒ㑻 where ~* '[ԑͼ]'; - | ֑ | ע1a ------------------+-----------+---------- + | ֑ | ע1a +------------+---------+-------- ԑԑʾ | A01 | - ԑͼ | B01 | + ԑͼ | B01 | (2 rows) select *,character_length() from Ƒ㑻; - | ֑ | ע1a | character_length ------------------+-----------+----------+------------------ - ԑԑʾ | A01 | | 5 - ԑͼ | B01 | | 4 - ԑ̑Ա | Z01 | | 5 + | ֑ | ע1a | character_length +------------+---------+--------+------------------ + ԑԑʾ | A01 | | 5 + ԑͼ | B01 | | 4 + ԑ̑Ա | Z01 | | 5 (3 rows) select *,octet_length() from Ƒ㑻; - | ֑ | ע1a | octet_length ------------------+-----------+----------+-------------- - ԑԑʾ | A01 | | 15 - ԑͼ | B01 | | 12 - ԑ̑Ա | Z01 | | 15 + | ֑ | ע1a | octet_length +------------+---------+--------+-------------- + ԑԑʾ | A01 | | 15 + ԑͼ | B01 | | 12 + ԑ̑Ա | Z01 | | 15 (3 rows) select *,position('' in ) from Ƒ㑻; - | ֑ | ע1a | position ------------------+-----------+----------+---------- - ԑԑʾ | A01 | | 3 - ԑͼ | B01 | | 0 - ԑ̑Ա | Z01 | | 0 + | ֑ | ע1a | position +------------+---------+--------+---------- + ԑԑʾ | A01 | | 3 + ԑͼ | B01 | | 0 + ԑ̑Ա | Z01 | | 0 (3 rows) select *,substring( from 3 for 4) from Ƒ㑻; - | ֑ | ע1a | substring ------------------+-----------+----------+----------- - ԑԑʾ | A01 | | ԑʾ - ԑͼ | B01 | | ͼ - ԑ̑Ա | Z01 | | ̑Ա + | ֑ | ע1a | substring +------------+---------+--------+----------- + ԑԑʾ | A01 | | ԑʾ + ԑͼ | B01 | | ͼ + ԑ̑Ա | Z01 | | ̑Ա (3 rows) drop table ͪߩѦ듾; @@ -182,81 +182,81 @@ insert into ͪߩѦ듾 values('ēǻ͓דȓ', 'B10'); insert into ͪߩѦ듾 values('ēǻ͓Γד', 'Z01'); vacuum ͪߩѦ듾; select * from ͪߩѦ듾; - 듾 | ړ | 1a ---------------------------+--------------+---------------- - ēǻ͓Ó | ѦA01߾ | - ēǻ͓דȓ | B10 | - ēǻ͓Γד | Z01 | + 듾 | ړ | 1a +------------------+----------+------------ + ēǻ͓Ó | ѦA01߾ | + ēǻ͓דȓ | B10 | + ēǻ͓Γד | Z01 | (3 rows) select * from ͪߩѦ듾 where ړ = 'Z01'; - 듾 | ړ | 1a ---------------------------+--------------+---------------- - ēǻ͓Γד | Z01 | + 듾 | ړ | 1a +------------------+----------+------------ + ēǻ͓Γד | Z01 | (1 row) select * from ͪߩѦ듾 where ړ ~* 'z01'; - 듾 | ړ | 1a ---------------------------+--------------+---------------- - ēǻ͓Γד | Z01 | + 듾 | ړ | 1a +------------------+----------+------------ + ēǻ͓Γד | Z01 | (1 row) select * from ͪߩѦ듾 where ړ like '_Z01_'; - 듾 | ړ | 1a ---------------------------+--------------+---------------- - ēǻ͓Γד | Z01 | + 듾 | ړ | 1a +------------------+----------+------------ + ēǻ͓Γד | Z01 | (1 row) select * from ͪߩѦ듾 where ړ like '_Z%'; - 듾 | ړ | 1a ---------------------------+--------------+---------------- - ēǻ͓Γד | Z01 | + 듾 | ړ | 1a +------------------+----------+------------ + ēǻ͓Γד | Z01 | (1 row) select * from ͪߩѦ듾 where 듾 ~ 'ēǻ[]'; - 듾 | ړ | 1a ---------------------------+--------------+---------------- - ēǻ͓Ó | ѦA01߾ | - ēǻ͓דȓ | B10 | + 듾 | ړ | 1a +------------------+----------+------------ + ēǻ͓Ó | ѦA01߾ | + ēǻ͓דȓ | B10 | (2 rows) select * from ͪߩѦ듾 where 듾 ~* 'ēǻ[]'; - 듾 | ړ | 1a ---------------------------+--------------+---------------- - ēǻ͓Ó | ѦA01߾ | - ēǻ͓דȓ | B10 | + 듾 | ړ | 1a +------------------+----------+------------ + ēǻ͓Ó | ѦA01߾ | + ēǻ͓דȓ | B10 | (2 rows) select *,character_length(듾) from ͪߩѦ듾; - 듾 | ړ | 1a | character_length ---------------------------+--------------+----------------+------------------ - ēǻ͓Ó | ѦA01߾ | | 8 - ēǻ͓דȓ | B10 | | 7 - ēǻ͓Γד | Z01 | | 8 + 듾 | ړ | 1a | character_length +------------------+----------+------------+------------------ + ēǻ͓Ó | ѦA01߾ | | 8 + ēǻ͓דȓ | B10 | | 7 + ēǻ͓Γד | Z01 | | 8 (3 rows) select *,octet_length(듾) from ͪߩѦ듾; - 듾 | ړ | 1a | octet_length ---------------------------+--------------+----------------+-------------- - ēǻ͓Ó | ѦA01߾ | | 24 - ēǻ͓דȓ | B10 | | 21 - ēǻ͓Γד | Z01 | | 24 + 듾 | ړ | 1a | octet_length +------------------+----------+------------+-------------- + ēǻ͓Ó | ѦA01߾ | | 24 + ēǻ͓דȓ | B10 | | 21 + ēǻ͓Γד | Z01 | | 24 (3 rows) select *,position('' in 듾) from ͪߩѦ듾; - 듾 | ړ | 1a | position ---------------------------+--------------+----------------+---------- - ēǻ͓Ó | ѦA01߾ | | 4 - ēǻ͓דȓ | B10 | | 0 - ēǻ͓Γד | Z01 | | 0 + 듾 | ړ | 1a | position +------------------+----------+------------+---------- + ēǻ͓Ó | ѦA01߾ | | 4 + ēǻ͓דȓ | B10 | | 0 + ēǻ͓Γד | Z01 | | 0 (3 rows) select *,substring(듾 from 3 for 4) from ͪߩѦ듾; - 듾 | ړ | 1a | substring ---------------------------+--------------+----------------+-------------- - ēǻ͓Ó | ѦA01߾ | | ͓ - ēǻ͓דȓ | B10 | | ͓ד - ēǻ͓Γד | Z01 | | ͓Γ + 듾 | ړ | 1a | substring +------------------+----------+------------+----------- + ēǻ͓Ó | ѦA01߾ | | ͓ + ēǻ͓דȓ | B10 | | ͓ד + ēǻ͓Γד | Z01 | | ͓Γ (3 rows) drop table test; @@ -269,8 +269,8 @@ insert into test values('SLENSKA'); insert into test values('ENGLISH FRANAIS ESPAOL SLENSKA'); vacuum test; select * from test; - t --------------------------------------- + t +----------------------------------- ENGLISH FRANAIS ESPAOL @@ -279,55 +279,55 @@ select * from test; (5 rows) select * from test where t = 'ESPAOL'; - t ----------- + t +--------- ESPAOL (1 row) select * from test where t ~* 'espaol'; - t --------------------------------------- + t +----------------------------------- ESPAOL ENGLISH FRANAIS ESPAOL SLENSKA (2 rows) select *,character_length(t) from test; - t | character_length ---------------------------------------+------------------ - ENGLISH | 7 - FRANAIS | 8 - ESPAOL | 7 - SLENSKA | 8 + t | character_length +-----------------------------------+------------------ + ENGLISH | 7 + FRANAIS | 8 + ESPAOL | 7 + SLENSKA | 8 ENGLISH FRANAIS ESPAOL SLENSKA | 33 (5 rows) select *,octet_length(t) from test; - t | octet_length ---------------------------------------+-------------- - ENGLISH | 7 - FRANAIS | 9 - ESPAOL | 8 - SLENSKA | 9 + t | octet_length +-----------------------------------+-------------- + ENGLISH | 7 + FRANAIS | 9 + ESPAOL | 8 + SLENSKA | 9 ENGLISH FRANAIS ESPAOL SLENSKA | 36 (5 rows) select *,position('L' in t) from test; - t | position ---------------------------------------+---------- - ENGLISH | 4 - FRANAIS | 0 - ESPAOL | 7 - SLENSKA | 3 + t | position +-----------------------------------+---------- + ENGLISH | 4 + FRANAIS | 0 + ESPAOL | 7 + SLENSKA | 3 ENGLISH FRANAIS ESPAOL SLENSKA | 4 (5 rows) select *,substring(t from 3 for 4) from test; - t | substring ---------------------------------------+----------- - ENGLISH | GLIS - FRANAIS | ANA - ESPAOL | PAO - SLENSKA | LENS + t | substring +-----------------------------------+----------- + ENGLISH | GLIS + FRANAIS | ANA + ESPAOL | PAO + SLENSKA | LENS ENGLISH FRANAIS ESPAOL SLENSKA | GLIS (5 rows) |