about summary refs log tree commit diff
path: root/src/backend/utils/mb/conv.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/utils/mb/conv.c')
-rw-r--r-- src/backend/utils/mb/conv.c 413
1 files changed, 193 insertions, 220 deletions
diff --git a/src/backend/utils/mb/conv.c b/src/backend/utils/mb/conv.c
index a24f69afcab..64554f6052c 100644
--- a/src/backend/utils/mb/conv.c
+++ b/src/backend/utils/mb/conv.c
@@ -6,172 +6,81 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/utils/mb/conv.c,v 1.48 2003/08/04 02:40:07 momjian Exp $
+ * $Header: /cvsroot/pgsql/src/backend/utils/mb/conv.c,v 1.48.4.1 2006/05/21 20:06:43 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "mb/pg_wchar.h"
-/*
- * convert bogus chars that cannot be represented in the current
- * encoding system.
- */
-void
-pg_print_bogus_char(unsigned char **mic, unsigned char **p)
-{
- char strbuf[16];
- int l = pg_mic_mblen(*mic);
-
- *(*p)++ = '(';
- while (l--)
- {
- sprintf(strbuf, "%02x", *(*mic)++);
- *(*p)++ = strbuf[0];
- *(*p)++ = strbuf[1];
- }
- *(*p)++ = ')';
-}
-
-#ifdef NOT_USED
-
-/*
- * GB18030 ---> MIC
- * Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
- */
-static void
-gb180302mic(unsigned char *gb18030, unsigned char *p, int len)
-{
- int c1;
- int c2;
-
- while (len > 0 && (c1 = *gb18030++))
- {
- if (c1 < 0x80)
- { /* should be ASCII */
- len--;
- *p++ = c1;
- }
- else if (c1 >= 0x81 && c1 <= 0xfe)
- {
- c2 = *gb18030++;
-
- if (c2 >= 0x30 && c2 <= 0x69)
- {
- len -= 4;
- *p++ = c1;
- *p++ = c2;
- *p++ = *gb18030++;
- *p++ = *gb18030++;
- *p++ = *gb18030++;
- }
- else if ((c2 >= 0x40 && c2 <= 0x7e) || (c2 >= 0x80 && c2 <= 0xfe))
- {
- len -= 2;
- *p++ = c1;
- *p++ = c2;
- *p++ = *gb18030++;
- }
- else
- { /* throw the strange code */
- len--;
- }
- }
- }
- *p = '\0';
-}
/*
- * MIC ---> GB18030
- * Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
- */
-static void
-mic2gb18030(unsigned char *mic, unsigned char *p, int len)
-{
- int c1;
- int c2;
-
- while (len > 0 && (c1 = *mic))
- {
- len -= pg_mic_mblen(mic++);
-
- if (c1 <= 0x7f) /* ASCII */
- *p++ = c1;
- else if (c1 >= 0x81 && c1 <= 0xfe)
- {
- c2 = *mic++;
-
- if ((c2 >= 0x40 && c2 <= 0x7e) || (c2 >= 0x80 && c2 <= 0xfe))
- {
- *p++ = c1;
- *p++ = c2;
- }
- else if (c2 >= 0x30 && c2 <= 0x39)
- {
- *p++ = c1;
- *p++ = c2;
- *p++ = *mic++;
- *p++ = *mic++;
- }
- else
- {
- mic--;
- pg_print_bogus_char(&mic, &p);
- mic--;
- pg_print_bogus_char(&mic, &p);
- }
- }
- else
- {
- mic--;
- pg_print_bogus_char(&mic, &p);
- }
- }
- *p = '\0';
-}
-#endif
-
-/*
- * LATINn ---> MIC
+ * LATINn ---> MIC when the charset's local codes map directly to MIC
+ *
+ * l points to the source string of length len
+ * p is the output area (must be large enough!)
+ * lc is the mule character set id for the local encoding
+ * encoding is the PG identifier for the local encoding
*/
void
-latin2mic(unsigned char *l, unsigned char *p, int len, int lc)
+latin2mic(const unsigned char *l, unsigned char *p, int len,
+ int lc, int encoding)
{
int c1;
- while (len-- > 0 && (c1 = *l++))
+ while (len > 0) /* one source byte is consumed per iteration */
{
- if (c1 > 0x7f)
- { /* Latin? */
+ c1 = *l;
+ if (c1 == 0) /* a zero byte inside the counted input is handed to report_invalid_encoding() */
+ report_invalid_encoding(encoding, (const char *) l, len);
+ if (IS_HIGHBIT_SET(c1)) /* high-bit-set byte: write the charset id lc ahead of it */
*p++ = lc;
- }
*p++ = c1;
+ l++;
+ len--;
}
*p = '\0';
}
/*
- * MIC ---> LATINn
+ * MIC ---> LATINn when the charset's local codes map directly to MIC
+ *
+ * mic points to the source string of length len
+ * p is the output area (must be large enough!)
+ * lc is the mule character set id for the local encoding
+ * encoding is the PG identifier for the local encoding
+ *
+ * Bytes without the high bit are copied unchanged. A high-bit-set byte
+ * must begin a character for which pg_mic_mblen() returns 2, whose first
+ * byte equals lc and whose second byte also has the high bit set; only
+ * that second byte is written to p. A zero byte, or a character longer
+ * than the remaining len, is passed to report_invalid_encoding(); any
+ * other mismatch is passed to report_untranslatable_char().
*/
void
-mic2latin(unsigned char *mic, unsigned char *p, int len, int lc)
+mic2latin(const unsigned char *mic, unsigned char *p, int len,
+ int lc, int encoding)
{
int c1;
- while (len > 0 && (c1 = *mic))
+ while (len > 0)
{
- len -= pg_mic_mblen(mic++);
-
- if (c1 == lc)
- *p++ = *mic++;
- else if (c1 > 0x7f)
+ c1 = *mic;
+ if (c1 == 0)
+ report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
+ if (!IS_HIGHBIT_SET(c1))
{
- mic--;
- pg_print_bogus_char(&mic, &p);
+ /* easy for ASCII */
+ *p++ = c1;
+ mic++;
+ len--;
}
else
- { /* should be ASCII */
- *p++ = c1;
+ {
+ int l = pg_mic_mblen(mic);
+
+ if (len < l) /* character would extend past the counted input */
+ report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
+ len);
+ if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]))
+ report_untranslatable_char(PG_MULE_INTERNAL, encoding,
+ (const char *) mic, len);
+ *p++ = mic[1]; /* the lc byte is dropped; only the second byte is emitted */
+ mic += 2;
+ len -= 2;
}
}
*p = '\0';
@@ -180,14 +89,25 @@ mic2latin(unsigned char *mic, unsigned char *p, int len, int lc)
/*
* ASCII ---> MIC
+ *
+ * While ordinarily SQL_ASCII encoding is forgiving of high-bit-set
+ * characters, here we must take a hard line because we don't know
+ * the appropriate MIC equivalent.
+ *
+ * l points to len source bytes and p is the output area. Each byte is
+ * copied unchanged; a zero byte or a high-bit-set byte is passed to
+ * report_invalid_encoding(). A terminating '\0' is stored after the loop.
*/
void
-pg_ascii2mic(unsigned char *l, unsigned char *p, int len)
+pg_ascii2mic(const unsigned char *l, unsigned char *p, int len)
{
int c1;
- while (len-- > 0 && (c1 = *l++))
- *p++ = (c1 & 0x7f);
+ while (len > 0) /* one source byte is consumed per iteration */
+ {
+ c1 = *l;
+ if (c1 == 0 || IS_HIGHBIT_SET(c1))
+ report_invalid_encoding(PG_SQL_ASCII, (const char *) l, len);
+ *p++ = c1; /* copied as-is; the old code masked with 0x7f instead */
+ l++;
+ len--;
+ }
*p = '\0';
}
@@ -195,19 +115,19 @@ pg_ascii2mic(unsigned char *l, unsigned char *p, int len)
* MIC ---> ASCII
*/
void
-pg_mic2ascii(unsigned char *mic, unsigned char *p, int len)
+pg_mic2ascii(const unsigned char *mic, unsigned char *p, int len)
{
int c1;
- while (len-- > 0 && (c1 = *mic))
+ while (len > 0) /* one source byte is consumed per iteration */
{
- if (c1 > 0x7f)
- pg_print_bogus_char(&mic, &p);
- else
- { /* should be ASCII */
- *p++ = c1;
- mic++;
- }
+ c1 = *mic;
+ if (c1 == 0 || IS_HIGHBIT_SET(c1)) /* zero or high-bit-set bytes go to report_untranslatable_char() */
+ report_untranslatable_char(PG_MULE_INTERNAL, PG_SQL_ASCII,
+ (const char *) mic, len);
+ *p++ = c1;
+ mic++;
+ len--;
}
*p = '\0';
}
@@ -215,87 +135,103 @@ pg_mic2ascii(unsigned char *mic, unsigned char *p, int len)
/*
* latin2mic_with_table: a generic single byte charset encoding
* conversion from a local charset to the mule internal code.
- * with a encoding conversion table.
- * the table is ordered according to the local charset,
+ *
+ * l points to the source string of length len
+ * p is the output area (must be large enough!)
+ * lc is the mule character set id for the local encoding
+ * encoding is the PG identifier for the local encoding
+ * tab holds conversion entries for the local charset
* starting from 128 (0x80). each entry in the table
* holds the corresponding code point for the mule internal code.
+ *
+ * Bytes without the high bit are copied unchanged. For a high-bit-set
+ * byte, a nonzero table entry is written after the lc byte; a zero table
+ * entry is passed to report_untranslatable_char(). A zero source byte is
+ * passed to report_invalid_encoding().
*/
void
-latin2mic_with_table(
- unsigned char *l, /* local charset string (source) */
- unsigned char *p, /* pointer to store mule internal
- * code (destination) */
- int len, /* length of l */
- int lc, /* leading character of p */
- unsigned char *tab /* code conversion table */
-)
+latin2mic_with_table(const unsigned char *l,
+ unsigned char *p,
+ int len,
+ int lc,
+ int encoding,
+ const unsigned char *tab)
{
unsigned char c1,
c2;
- while (len-- > 0 && (c1 = *l++))
+ while (len > 0)
{
- if (c1 < 128)
+ c1 = *l;
+ if (c1 == 0)
+ report_invalid_encoding(encoding, (const char *) l, len);
+ if (!IS_HIGHBIT_SET(c1))
*p++ = c1;
else
{
- c2 = tab[c1 - 128];
+ c2 = tab[c1 - HIGHBIT]; /* table index 0 corresponds to source byte HIGHBIT */
if (c2)
{
*p++ = lc;
*p++ = c2;
}
else
- {
- *p++ = ' '; /* cannot convert */
- }
+ report_untranslatable_char(encoding, PG_MULE_INTERNAL,
+ (const char *) l, len);
}
+ l++; /* l is advanced only here, so the report calls above see the offending byte */
+ len--;
}
*p = '\0';
}
/*
* mic2latin_with_table: a generic single byte charset encoding
- * conversion from the mule internal code to a local charset
- * with a encoding conversion table.
- * the table is ordered according to the second byte of the mule
- * internal code starting from 128 (0x80).
- * each entry in the table
- * holds the corresponding code point for the local code.
+ * conversion from the mule internal code to a local charset.
+ *
+ * mic points to the source string of length len
+ * p is the output area (must be large enough!)
+ * lc is the mule character set id for the local encoding
+ * encoding is the PG identifier for the local encoding
+ * tab holds conversion entries for the mule internal code's
+ * second byte, starting from 128 (0x80). each entry in the table
+ * holds the corresponding code point for the local charset.
+ *
+ * Bytes without the high bit are copied unchanged. A high-bit-set byte
+ * must begin a character for which pg_mic_mblen() returns 2, whose first
+ * byte equals lc, whose second byte has the high bit set, and whose table
+ * entry is nonzero; otherwise it is passed to report_untranslatable_char().
+ * A zero byte, or a character longer than the remaining len, is passed to
+ * report_invalid_encoding().
*/
void
-mic2latin_with_table(
- unsigned char *mic, /* mule internal code
- * (source) */
- unsigned char *p, /* local code (destination) */
- int len, /* length of p */
- int lc, /* leading character */
- unsigned char *tab /* code conversion table */
-)
+mic2latin_with_table(const unsigned char *mic,
+ unsigned char *p,
+ int len,
+ int lc,
+ int encoding,
+ const unsigned char *tab)
{
-
unsigned char c1,
c2;
- while (len-- > 0 && (c1 = *mic++))
+ while (len > 0)
{
- if (c1 < 128)
- *p++ = c1;
- else if (c1 == lc)
+ c1 = *mic;
+ if (c1 == 0)
+ report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
+ if (!IS_HIGHBIT_SET(c1))
{
- c1 = *mic++;
+ /* easy for ASCII */
+ *p++ = c1;
+ mic++;
len--;
- c2 = tab[c1 - 128];
- if (c2)
- *p++ = c2;
- else
- {
- *p++ = ' '; /* cannot convert */
- }
}
else
{
- *p++ = ' '; /* bogus character */
+ int l = pg_mic_mblen(mic);
+
+ if (len < l) /* character would extend past the counted input */
+ report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
+ len);
+ if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]) ||
+ (c2 = tab[mic[1] - HIGHBIT]) == 0) /* a zero table entry is treated as untranslatable */
+ {
+ report_untranslatable_char(PG_MULE_INTERNAL, encoding,
+ (const char *) mic, len);
+ break; /* keep compiler quiet */
+ }
+ *p++ = c2;
+ mic += 2;
+ len -= 2;
}
}
*p = '\0';
@@ -332,27 +268,40 @@ compare2(const void *p1, const void *p2)
}
/*
- * UTF-8 ---> local code
+ * UTF8 ---> local code
*
- * utf: input UTF-8 string. Its length is limited by "len" parameter
- * or a null terminator.
- * iso: pointer to the output.
+ * utf: input UTF8 string (need not be null-terminated).
+ * iso: pointer to the output area (must be large enough!)
* map: the conversion map.
* size: the size of the conversion map.
+ * encoding: the PG identifier for the local encoding.
+ * len: length of input string.
+ *
+ * A zero byte, a character longer than the remaining len, or a sequence
+ * rejected by pg_utf8_islegal() ends the loop early with len still
+ * positive, and the remainder is passed to report_invalid_encoding().
+ * A character with no entry in map is passed to
+ * report_untranslatable_char().
*/
void
-UtfToLocal(unsigned char *utf, unsigned char *iso,
- pg_utf_to_local *map, int size, int len)
+UtfToLocal(const unsigned char *utf, unsigned char *iso,
+ const pg_utf_to_local *map, int size, int encoding, int len)
{
unsigned int iutf;
int l;
pg_utf_to_local *p;
- for (; len > 0 && *utf; len -= l)
+ for (; len > 0; len -= l)
{
+ /* "break" cases all represent errors */
+ if (*utf == '\0')
+ break;
+
l = pg_utf_mblen(utf);
+
+ if (len < l)
+ break;
+
+ if (!pg_utf8_islegal(utf, l))
+ break;
+
if (l == 1)
{
+ /* ASCII case is easy */
*iso++ = *utf++;
continue;
}
@@ -361,22 +310,27 @@ UtfToLocal(unsigned char *utf, unsigned char *iso,
iutf = *utf++ << 8;
iutf |= *utf++;
}
- else
+ else if (l == 3)
{
iutf = *utf++ << 16;
iutf |= *utf++ << 8;
iutf |= *utf++;
}
+ else if (l == 4)
+ {
+ iutf = (unsigned int) *utf++ << 24; /* widen first: the byte promotes to signed int, and shifting a value >= 0x80 by 24 overflows it */
+ iutf |= *utf++ << 16;
+ iutf |= *utf++ << 8;
+ iutf |= *utf++;
+ }
+
p = bsearch(&iutf, map, size,
sizeof(pg_utf_to_local), compare1);
+
if (p == NULL)
- {
- ereport(WARNING,
- (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
- errmsg("ignoring unconvertible UTF-8 character 0x%04x",
- iutf)));
- continue;
- }
+ report_untranslatable_char(PG_UTF8, encoding,
+ (const char *) (utf - l), len);
+
if (p->code & 0xff000000)
*iso++ = p->code >> 24;
if (p->code & 0x00ff0000)
@@ -386,15 +340,26 @@ UtfToLocal(unsigned char *utf, unsigned char *iso,
if (p->code & 0x000000ff)
*iso++ = p->code & 0x000000ff;
}
+
+ if (len > 0)
+ report_invalid_encoding(PG_UTF8, (const char *) utf, len);
+
*iso = '\0';
}
/*
- * local code ---> UTF-8
+ * local code ---> UTF8
+ *
+ * iso: input local string (need not be null-terminated).
+ * utf: pointer to the output area (must be large enough!)
+ * map: the conversion map.
+ * size: the size of the conversion map.
+ * encoding: the PG identifier for the local encoding.
+ * len: length of input string.
+ *
+ * A zero byte, or a negative result from pg_encoding_verifymb(), ends
+ * the loop early with len still positive, and the remainder is passed
+ * to report_invalid_encoding(). A character with no entry in map is
+ * passed to report_untranslatable_char().
*/
void
-LocalToUtf(unsigned char *iso, unsigned char *utf,
- pg_local_to_utf *map, int size, int encoding, int len)
+LocalToUtf(const unsigned char *iso, unsigned char *utf,
+ const pg_local_to_utf *map, int size, int encoding, int len)
{
unsigned int iiso;
int l;
@@ -405,16 +370,23 @@ LocalToUtf(unsigned char *iso, unsigned char *utf,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("invalid encoding number: %d", encoding)));
- for (; len > 0 && *iso; len -= l)
+ for (; len > 0; len -= l)
{
- if (*iso < 0x80)
+ /* "break" cases all represent errors */
+ if (*iso == '\0')
+ break;
+
+ if (!IS_HIGHBIT_SET(*iso))
{
+ /* ASCII case is easy */
*utf++ = *iso++;
l = 1;
continue;
}
- l = pg_encoding_mblen(encoding, iso);
+ l = pg_encoding_verifymb(encoding, (const char *) iso, len); /* a negative result is handled as an error just below */
+ if (l < 0)
+ break;
if (l == 1)
iiso = *iso++;
@@ -436,16 +408,13 @@ LocalToUtf(unsigned char *iso, unsigned char *utf,
iiso |= *iso++ << 8;
iiso |= *iso++;
}
+
p = bsearch(&iiso, map, size,
sizeof(pg_local_to_utf), compare2);
if (p == NULL)
- {
- ereport(WARNING,
- (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
- errmsg("ignoring unconvertible %s character 0x%04x",
- (&pg_enc2name_tbl[encoding])->name, iiso)));
- continue;
- }
+ report_untranslatable_char(encoding, PG_UTF8,
+ (const char *) (iso - l), len); /* iso was already advanced by l, so step back to the character's first byte */
+
if (p->utf & 0xff000000)
*utf++ = p->utf >> 24;
if (p->utf & 0x00ff0000)
@@ -455,5 +424,9 @@ LocalToUtf(unsigned char *iso, unsigned char *utf,
if (p->utf & 0x000000ff)
*utf++ = p->utf & 0x000000ff;
}
+
+ if (len > 0)
+ report_invalid_encoding(encoding, (const char *) iso, len);
+
*utf = '\0';
}