diff options
Diffstat (limited to 'src/backend/utils/mb/conv.c')
-rw-r--r-- | src/backend/utils/mb/conv.c | 241 |
1 files changed, 207 insertions, 34 deletions
diff --git a/src/backend/utils/mb/conv.c b/src/backend/utils/mb/conv.c index 81a76832e05..008ce868735 100644 --- a/src/backend/utils/mb/conv.c +++ b/src/backend/utils/mb/conv.c @@ -6,7 +6,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/mb/conv.c,v 1.62 2007/01/05 22:19:44 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/utils/mb/conv.c,v 1.63 2007/03/25 11:56:02 ishii Exp $ * *------------------------------------------------------------------------- */ @@ -244,10 +244,10 @@ mic2latin_with_table(const unsigned char *mic, static int compare1(const void *p1, const void *p2) { - unsigned int v1, + uint32 v1, v2; - v1 = *(unsigned int *) p1; + v1 = *(uint32 *) p1; v2 = ((pg_utf_to_local *) p2)->utf; return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1); } @@ -259,31 +259,86 @@ compare1(const void *p1, const void *p2) static int compare2(const void *p1, const void *p2) { - unsigned int v1, + uint32 v1, v2; - v1 = *(unsigned int *) p1; + v1 = *(uint32 *) p1; v2 = ((pg_local_to_utf *) p2)->code; return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1); } /* + * comparison routine for bsearch() + * this routine is intended for combined UTF8 -> local code + */ +static int +compare3(const void *p1, const void *p2) +{ + uint32 s1, s2, d1, d2; + + s1 = *(uint32 *)p1; + s2 = *((uint32 *)p1 + 1); + d1 = ((pg_utf_to_local_combined *) p2)->utf1; + d2 = ((pg_utf_to_local_combined *) p2)->utf2; + return (s1 > d1 || (s1 == d1 && s2 > d2)) ? 1 : ((s1 == d1 && s2 == d2) ? 0 : -1); +} + +/* + * comparison routine for bsearch() + * this routine is intended for local code -> combined UTF8 + */ +static int +compare4(const void *p1, const void *p2) +{ + uint32 v1, + v2; + + v1 = *(uint32 *) p1; + v2 = ((pg_local_to_utf_combined *) p2)->code; + return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1); +} + +/* + * convert 32bit wide character to mutibye stream pointed to by iso + */ +static unsigned char *set_iso_code(unsigned char *iso, uint32 code) +{ + if (code & 0xff000000) + *iso++ = code >> 24; + if (code & 0x00ff0000) + *iso++ = (code & 0x00ff0000) >> 16; + if (code & 0x0000ff00) + *iso++ = (code & 0x0000ff00) >> 8; + if (code & 0x000000ff) + *iso++ = code & 0x000000ff; + return iso; +} + +/* * UTF8 ---> local code * * utf: input UTF8 string (need not be null-terminated). * iso: pointer to the output area (must be large enough!) * map: the conversion map. - * size: the size of the conversion map. + * cmap: the conversion map for combined characters. + * (optional) + * size1: the size of the conversion map. + * size2: the size of the conversion map for combined characters + * (optional) * encoding: the PG identifier for the local encoding. * len: length of input string. */ void UtfToLocal(const unsigned char *utf, unsigned char *iso, - const pg_utf_to_local *map, int size, int encoding, int len) + const pg_utf_to_local *map, const pg_utf_to_local_combined *cmap, + int size1, int size2, int encoding, int len) { - unsigned int iutf; - int l; + uint32 iutf; + uint32 cutf[2]; + uint32 code; pg_utf_to_local *p; + pg_utf_to_local_combined *cp; + int l; for (; len > 0; len -= l) { @@ -324,21 +379,94 @@ UtfToLocal(const unsigned char *utf, unsigned char *iso, iutf |= *utf++; } - p = bsearch(&iutf, map, size, - sizeof(pg_utf_to_local), compare1); + /* + * first, try with combined map if possible + */ + if (cmap && len > l) + { + const unsigned char *utf_save = utf; + int len_save = len; + int l_save = l; + + len -= l; - if (p == NULL) - report_untranslatable_char(PG_UTF8, encoding, - (const char *) (utf - l), len); - - if (p->code & 0xff000000) - *iso++ = p->code >> 24; - if (p->code & 0x00ff0000) - *iso++ = (p->code & 0x00ff0000) >> 16; - if (p->code & 0x0000ff00) - *iso++ = (p->code & 0x0000ff00) >> 8; - if (p->code & 0x000000ff) - *iso++ = p->code & 0x000000ff; + l = pg_utf_mblen(utf); + if (len < l) + break; + + if (!pg_utf8_islegal(utf, l)) + break; + + cutf[0] = iutf; + + if (l == 1) + { + if (len_save > 1) + { + p = bsearch(&cutf[0], map, size1, + sizeof(pg_utf_to_local), compare1); + if (p == NULL) + report_untranslatable_char(PG_UTF8, encoding, + (const char *) (utf_save - l_save), len_save); + iso = set_iso_code(iso, p->code); + } + + /* ASCII case is easy */ + *iso++ = *utf++; + continue; + } + else if (l == 2) + { + iutf = *utf++ << 8; + iutf |= *utf++; + } + else if (l == 3) + { + iutf = *utf++ << 16; + iutf |= *utf++ << 8; + iutf |= *utf++; + } + else if (l == 4) + { + iutf = *utf++ << 24; + iutf |= *utf++ << 16; + iutf |= *utf++ << 8; + iutf |= *utf++; + } + + cutf[1] = iutf; + cp = bsearch(cutf, cmap, size2, + sizeof(pg_utf_to_local_combined), compare3); + if (cp) + code = cp->code; + else + { + /* not found in combined map. try with ordinary map */ + p = bsearch(&cutf[0], map, size1, + sizeof(pg_utf_to_local), compare1); + if (p == NULL) + report_untranslatable_char(PG_UTF8, encoding, + (const char *) (utf_save - l_save), len_save); + iso = set_iso_code(iso, p->code); + + p = bsearch(&cutf[1], map, size1, + sizeof(pg_utf_to_local), compare1); + if (p == NULL) + report_untranslatable_char(PG_UTF8, encoding, + (const char *) (utf - l), len); + code = p->code; + } + } + else /* no cmap or no remaining data */ + { + p = bsearch(&iutf, map, size1, + sizeof(pg_utf_to_local), compare1); + if (p == NULL) + report_untranslatable_char(PG_UTF8, encoding, + (const char *) (utf - l), len); + code = p->code; + } + iso = set_iso_code(iso, code); } if (len > 0) @@ -353,17 +481,23 @@ UtfToLocal(const unsigned char *utf, unsigned char *iso, * iso: input local string (need not be null-terminated). * utf: pointer to the output area (must be large enough!) * map: the conversion map. - * size: the size of the conversion map. + * cmap: the conversion map for combined characters. + * (optional) + * size1: the size of the conversion map. + * size2: the size of the conversion map for combined characters + * (optional) * encoding: the PG identifier for the local encoding. * len: length of input string. */ void LocalToUtf(const unsigned char *iso, unsigned char *utf, - const pg_local_to_utf *map, int size, int encoding, int len) + const pg_local_to_utf *map, const pg_local_to_utf_combined *cmap, + int size1, int size2, int encoding, int len) { unsigned int iiso; int l; pg_local_to_utf *p; + pg_local_to_utf_combined *cp; if (!PG_VALID_ENCODING(encoding)) ereport(ERROR, @@ -409,20 +543,59 @@ LocalToUtf(const unsigned char *iso, unsigned char *utf, iiso |= *iso++; } - p = bsearch(&iiso, map, size, + p = bsearch(&iiso, map, size1, sizeof(pg_local_to_utf), compare2); + if (p == NULL) + { + /* + * not found in the ordinary map. if there's a combined + * character map, try with it + */ + if (cmap) + { + cp = bsearch(&iiso, cmap, size2, + sizeof(pg_local_to_utf_combined), compare4); + + if (cp) + { + if (cp->utf1 & 0xff000000) + *utf++ = cp->utf1 >> 24; + if (cp->utf1 & 0x00ff0000) + *utf++ = (cp->utf1 & 0x00ff0000) >> 16; + if (cp->utf1 & 0x0000ff00) + *utf++ = (cp->utf1 & 0x0000ff00) >> 8; + if (cp->utf1 & 0x000000ff) + *utf++ = cp->utf1 & 0x000000ff; + + if (cp->utf2 & 0xff000000) + *utf++ = cp->utf2 >> 24; + if (cp->utf2 & 0x00ff0000) + *utf++ = (cp->utf2 & 0x00ff0000) >> 16; + if (cp->utf2 & 0x0000ff00) + *utf++ = (cp->utf2 & 0x0000ff00) >> 8; + if (cp->utf2 & 0x000000ff) + *utf++ = cp->utf2 & 0x000000ff; + + continue; + } + } + report_untranslatable_char(encoding, PG_UTF8, (const char *) (iso - l), len); - if (p->utf & 0xff000000) - *utf++ = p->utf >> 24; - if (p->utf & 0x00ff0000) - *utf++ = (p->utf & 0x00ff0000) >> 16; - if (p->utf & 0x0000ff00) - *utf++ = (p->utf & 0x0000ff00) >> 8; - if (p->utf & 0x000000ff) - *utf++ = p->utf & 0x000000ff; + } + else + { + if (p->utf & 0xff000000) + *utf++ = p->utf >> 24; + if (p->utf & 0x00ff0000) + *utf++ = (p->utf & 0x00ff0000) >> 16; + if (p->utf & 0x0000ff00) + *utf++ = (p->utf & 0x0000ff00) >> 8; + if (p->utf & 0x000000ff) + *utf++ = p->utf & 0x000000ff; + } } if (len > 0) |