aboutsummaryrefslogtreecommitdiff
path: root/src/backend/utils/mb/conv.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/utils/mb/conv.c')
-rw-r--r--src/backend/utils/mb/conv.c241
1 files changed, 207 insertions, 34 deletions
diff --git a/src/backend/utils/mb/conv.c b/src/backend/utils/mb/conv.c
index 81a76832e05..008ce868735 100644
--- a/src/backend/utils/mb/conv.c
+++ b/src/backend/utils/mb/conv.c
@@ -6,7 +6,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/utils/mb/conv.c,v 1.62 2007/01/05 22:19:44 momjian Exp $
+ * $PostgreSQL: pgsql/src/backend/utils/mb/conv.c,v 1.63 2007/03/25 11:56:02 ishii Exp $
*
*-------------------------------------------------------------------------
*/
@@ -244,10 +244,10 @@ mic2latin_with_table(const unsigned char *mic,
static int
compare1(const void *p1, const void *p2)
{
- unsigned int v1,
+ uint32 v1,
v2;
- v1 = *(unsigned int *) p1;
+ v1 = *(uint32 *) p1;
v2 = ((pg_utf_to_local *) p2)->utf;
return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
}
@@ -259,31 +259,86 @@ compare1(const void *p1, const void *p2)
static int
compare2(const void *p1, const void *p2)
{
- unsigned int v1,
+ uint32 v1,
v2;
- v1 = *(unsigned int *) p1;
+ v1 = *(uint32 *) p1;
v2 = ((pg_local_to_utf *) p2)->code;
return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
}
/*
+ * comparison routine for bsearch()
+ * this routine is intended for combined UTF8 -> local code
+ */
+static int
+compare3(const void *p1, const void *p2)
+{
+ uint32 s1, s2, d1, d2;
+
+ s1 = *(uint32 *)p1;
+ s2 = *((uint32 *)p1 + 1);
+ d1 = ((pg_utf_to_local_combined *) p2)->utf1;
+ d2 = ((pg_utf_to_local_combined *) p2)->utf2;
+ return (s1 > d1 || (s1 == d1 && s2 > d2)) ? 1 : ((s1 == d1 && s2 == d2) ? 0 : -1);
+}
+
+/*
+ * comparison routine for bsearch()
+ * this routine is intended for local code -> combined UTF8
+ */
+static int
+compare4(const void *p1, const void *p2)
+{
+ uint32 v1,
+ v2;
+
+ v1 = *(uint32 *) p1;
+ v2 = ((pg_local_to_utf_combined *) p2)->code;
+ return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
+}
+
+/*
+ * convert 32bit wide character to mutibye stream pointed to by iso
+ */
+static unsigned char *set_iso_code(unsigned char *iso, uint32 code)
+{
+ if (code & 0xff000000)
+ *iso++ = code >> 24;
+ if (code & 0x00ff0000)
+ *iso++ = (code & 0x00ff0000) >> 16;
+ if (code & 0x0000ff00)
+ *iso++ = (code & 0x0000ff00) >> 8;
+ if (code & 0x000000ff)
+ *iso++ = code & 0x000000ff;
+ return iso;
+}
+
+/*
* UTF8 ---> local code
*
* utf: input UTF8 string (need not be null-terminated).
* iso: pointer to the output area (must be large enough!)
* map: the conversion map.
- * size: the size of the conversion map.
+ * cmap: the conversion map for combined characters.
+ * (optional)
+ * size1: the size of the conversion map.
+ * size2: the size of the conversion map for combined characters
+ * (optional)
* encoding: the PG identifier for the local encoding.
* len: length of input string.
*/
void
UtfToLocal(const unsigned char *utf, unsigned char *iso,
- const pg_utf_to_local *map, int size, int encoding, int len)
+ const pg_utf_to_local *map, const pg_utf_to_local_combined *cmap,
+ int size1, int size2, int encoding, int len)
{
- unsigned int iutf;
- int l;
+ uint32 iutf;
+ uint32 cutf[2];
+ uint32 code;
pg_utf_to_local *p;
+ pg_utf_to_local_combined *cp;
+ int l;
for (; len > 0; len -= l)
{
@@ -324,21 +379,94 @@ UtfToLocal(const unsigned char *utf, unsigned char *iso,
iutf |= *utf++;
}
- p = bsearch(&iutf, map, size,
- sizeof(pg_utf_to_local), compare1);
+ /*
+ * first, try with combined map if possible
+ */
+ if (cmap && len > l)
+ {
+ const unsigned char *utf_save = utf;
+ int len_save = len;
+ int l_save = l;
+
+ len -= l;
- if (p == NULL)
- report_untranslatable_char(PG_UTF8, encoding,
- (const char *) (utf - l), len);
-
- if (p->code & 0xff000000)
- *iso++ = p->code >> 24;
- if (p->code & 0x00ff0000)
- *iso++ = (p->code & 0x00ff0000) >> 16;
- if (p->code & 0x0000ff00)
- *iso++ = (p->code & 0x0000ff00) >> 8;
- if (p->code & 0x000000ff)
- *iso++ = p->code & 0x000000ff;
+ l = pg_utf_mblen(utf);
+ if (len < l)
+ break;
+
+ if (!pg_utf8_islegal(utf, l))
+ break;
+
+ cutf[0] = iutf;
+
+ if (l == 1)
+ {
+ if (len_save > 1)
+ {
+ p = bsearch(&cutf[0], map, size1,
+ sizeof(pg_utf_to_local), compare1);
+ if (p == NULL)
+ report_untranslatable_char(PG_UTF8, encoding,
+ (const char *) (utf_save - l_save), len_save);
+ iso = set_iso_code(iso, p->code);
+ }
+
+ /* ASCII case is easy */
+ *iso++ = *utf++;
+ continue;
+ }
+ else if (l == 2)
+ {
+ iutf = *utf++ << 8;
+ iutf |= *utf++;
+ }
+ else if (l == 3)
+ {
+ iutf = *utf++ << 16;
+ iutf |= *utf++ << 8;
+ iutf |= *utf++;
+ }
+ else if (l == 4)
+ {
+ iutf = *utf++ << 24;
+ iutf |= *utf++ << 16;
+ iutf |= *utf++ << 8;
+ iutf |= *utf++;
+ }
+
+ cutf[1] = iutf;
+ cp = bsearch(cutf, cmap, size2,
+ sizeof(pg_utf_to_local_combined), compare3);
+ if (cp)
+ code = cp->code;
+ else
+ {
+ /* not found in combined map. try with ordinary map */
+ p = bsearch(&cutf[0], map, size1,
+ sizeof(pg_utf_to_local), compare1);
+ if (p == NULL)
+ report_untranslatable_char(PG_UTF8, encoding,
+ (const char *) (utf_save - l_save), len_save);
+ iso = set_iso_code(iso, p->code);
+
+ p = bsearch(&cutf[1], map, size1,
+ sizeof(pg_utf_to_local), compare1);
+ if (p == NULL)
+ report_untranslatable_char(PG_UTF8, encoding,
+ (const char *) (utf - l), len);
+ code = p->code;
+ }
+ }
+ else /* no cmap or no remaining data */
+ {
+ p = bsearch(&iutf, map, size1,
+ sizeof(pg_utf_to_local), compare1);
+ if (p == NULL)
+ report_untranslatable_char(PG_UTF8, encoding,
+ (const char *) (utf - l), len);
+ code = p->code;
+ }
+ iso = set_iso_code(iso, code);
}
if (len > 0)
@@ -353,17 +481,23 @@ UtfToLocal(const unsigned char *utf, unsigned char *iso,
* iso: input local string (need not be null-terminated).
* utf: pointer to the output area (must be large enough!)
* map: the conversion map.
- * size: the size of the conversion map.
+ * cmap: the conversion map for combined characters.
+ * (optional)
+ * size1: the size of the conversion map.
+ * size2: the size of the conversion map for combined characters
+ * (optional)
* encoding: the PG identifier for the local encoding.
* len: length of input string.
*/
void
LocalToUtf(const unsigned char *iso, unsigned char *utf,
- const pg_local_to_utf *map, int size, int encoding, int len)
+ const pg_local_to_utf *map, const pg_local_to_utf_combined *cmap,
+ int size1, int size2, int encoding, int len)
{
unsigned int iiso;
int l;
pg_local_to_utf *p;
+ pg_local_to_utf_combined *cp;
if (!PG_VALID_ENCODING(encoding))
ereport(ERROR,
@@ -409,20 +543,59 @@ LocalToUtf(const unsigned char *iso, unsigned char *utf,
iiso |= *iso++;
}
- p = bsearch(&iiso, map, size,
+ p = bsearch(&iiso, map, size1,
sizeof(pg_local_to_utf), compare2);
+
if (p == NULL)
+ {
+ /*
+ * not found in the ordinary map. if there's a combined
+ * character map, try with it
+ */
+ if (cmap)
+ {
+ cp = bsearch(&iiso, cmap, size2,
+ sizeof(pg_local_to_utf_combined), compare4);
+
+ if (cp)
+ {
+ if (cp->utf1 & 0xff000000)
+ *utf++ = cp->utf1 >> 24;
+ if (cp->utf1 & 0x00ff0000)
+ *utf++ = (cp->utf1 & 0x00ff0000) >> 16;
+ if (cp->utf1 & 0x0000ff00)
+ *utf++ = (cp->utf1 & 0x0000ff00) >> 8;
+ if (cp->utf1 & 0x000000ff)
+ *utf++ = cp->utf1 & 0x000000ff;
+
+ if (cp->utf2 & 0xff000000)
+ *utf++ = cp->utf2 >> 24;
+ if (cp->utf2 & 0x00ff0000)
+ *utf++ = (cp->utf2 & 0x00ff0000) >> 16;
+ if (cp->utf2 & 0x0000ff00)
+ *utf++ = (cp->utf2 & 0x0000ff00) >> 8;
+ if (cp->utf2 & 0x000000ff)
+ *utf++ = cp->utf2 & 0x000000ff;
+
+ continue;
+ }
+ }
+
report_untranslatable_char(encoding, PG_UTF8,
(const char *) (iso - l), len);
- if (p->utf & 0xff000000)
- *utf++ = p->utf >> 24;
- if (p->utf & 0x00ff0000)
- *utf++ = (p->utf & 0x00ff0000) >> 16;
- if (p->utf & 0x0000ff00)
- *utf++ = (p->utf & 0x0000ff00) >> 8;
- if (p->utf & 0x000000ff)
- *utf++ = p->utf & 0x000000ff;
+ }
+ else
+ {
+ if (p->utf & 0xff000000)
+ *utf++ = p->utf >> 24;
+ if (p->utf & 0x00ff0000)
+ *utf++ = (p->utf & 0x00ff0000) >> 16;
+ if (p->utf & 0x0000ff00)
+ *utf++ = (p->utf & 0x0000ff00) >> 8;
+ if (p->utf & 0x000000ff)
+ *utf++ = p->utf & 0x000000ff;
+ }
}
if (len > 0)