aboutsummaryrefslogtreecommitdiff
path: root/src/backend/utils/adt/pg_locale.c
diff options
context:
space:
mode:
authorTom Lane <tgl@sss.pgh.pa.us>2011-04-23 12:35:41 -0400
committerTom Lane <tgl@sss.pgh.pa.us>2011-04-23 12:35:41 -0400
commit2ab0796d7a3a7116a79b65531fd33f1548514b52 (patch)
treeb327fa2ba27bef4dbd2dd287e23b5bf6ba8ac2ee /src/backend/utils/adt/pg_locale.c
parentbb850306307d3d6ebb611c4039ae127236eb1699 (diff)
downloadpostgresql-2ab0796d7a3a7116a79b65531fd33f1548514b52.tar.gz
postgresql-2ab0796d7a3a7116a79b65531fd33f1548514b52.zip
Fix char2wchar/wchar2char to support collations properly.
These functions should take a pg_locale_t, not a collation OID, and should call mbstowcs_l/wcstombs_l where available. Where those functions are not available, temporarily select the correct locale with uselocale(). This change removes the bogus assumption that all locales selectable in a given database have the same wide-character conversion method; in particular, the collate.linux.utf8 regression test now passes with LC_CTYPE=C, so long as the database encoding is UTF8. I decided to move the char2wchar/wchar2char functions out of mbutils.c and into pg_locale.c, because they work on wchar_t not pg_wchar_t and thus don't really belong with the mbutils.c functions. Keeping them where they were would have required importing pg_locale_t into pg_wchar.h somehow, which did not seem like a good plan.
Diffstat (limited to 'src/backend/utils/adt/pg_locale.c')
-rw-r--r--src/backend/utils/adt/pg_locale.c173
1 files changed, 173 insertions, 0 deletions
diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index 0e6723d4690..8208d3cad9e 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -1030,3 +1030,176 @@ pg_newlocale_from_collation(Oid collid)
return cache_entry->locale;
}
+
+
+/*
+ * These functions convert from/to libc's wchar_t, *not* pg_wchar_t.
+ * Therefore we keep them here rather than with the mbutils code.
+ */
+
+#ifdef USE_WIDE_UPPER_LOWER
+
+/*
+ * wchar2char --- convert wide characters to multibyte format
+ *
+ * This has the same API as the standard wcstombs_l() function; in particular,
+ * tolen is the maximum number of bytes to store at *to, and *from must be
+ * zero-terminated. The output will be zero-terminated iff there is room.
+ */
+size_t
+wchar2char(char *to, const wchar_t *from, size_t tolen, pg_locale_t locale)
+{
+ size_t result;
+
+ if (tolen == 0)
+ return 0;
+
+#ifdef WIN32
+
+ /*
+ * On Windows, the "Unicode" locales assume UTF16 not UTF8 encoding, and
+ * for some reason mbstowcs and wcstombs won't do this for us, so we use
+ * MultiByteToWideChar().
+ */
+ if (GetDatabaseEncoding() == PG_UTF8)
+ {
+ result = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, tolen,
+ NULL, NULL);
+ /* A zero return is failure */
+ if (result <= 0)
+ result = -1;
+ else
+ {
+ Assert(result <= tolen);
+ /* Microsoft counts the zero terminator in the result */
+ result--;
+ }
+ }
+ else
+#endif /* WIN32 */
+ if (locale == (pg_locale_t) 0)
+ {
+ /* Use wcstombs directly for the default locale */
+ result = wcstombs(to, from, tolen);
+ }
+ else
+ {
+#ifdef HAVE_LOCALE_T
+#ifdef HAVE_WCSTOMBS_L
+ /* Use wcstombs_l for nondefault locales */
+ result = wcstombs_l(to, from, tolen, locale);
+#else /* !HAVE_WCSTOMBS_L */
+ /* We have to temporarily set the locale as current ... ugh */
+ locale_t save_locale = uselocale(locale);
+
+ result = wcstombs(to, from, tolen);
+
+ uselocale(save_locale);
+#endif /* HAVE_WCSTOMBS_L */
+#else /* !HAVE_LOCALE_T */
+ /* Can't have locale != 0 without HAVE_LOCALE_T */
+ elog(ERROR, "wcstombs_l is not available");
+ result = 0; /* keep compiler quiet */
+#endif /* HAVE_LOCALE_T */
+ }
+
+ return result;
+}
+
+/*
+ * char2wchar --- convert multibyte characters to wide characters
+ *
+ * This has almost the API of mbstowcs_l(), except that *from need not be
+ * null-terminated; instead, the number of input bytes is specified as
+ * fromlen. Also, we ereport() rather than returning -1 for invalid
+ * input encoding. tolen is the maximum number of wchar_t's to store at *to.
+ * The output will be zero-terminated iff there is room.
+ */
+size_t
+char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen,
+ pg_locale_t locale)
+{
+ size_t result;
+
+ if (tolen == 0)
+ return 0;
+
+#ifdef WIN32
+ /* See WIN32 "Unicode" comment above */
+ if (GetDatabaseEncoding() == PG_UTF8)
+ {
+ /* Win32 API does not work for zero-length input */
+ if (fromlen == 0)
+ result = 0;
+ else
+ {
+ result = MultiByteToWideChar(CP_UTF8, 0, from, fromlen, to, tolen - 1);
+ /* A zero return is failure */
+ if (result == 0)
+ result = -1;
+ }
+
+ if (result != -1)
+ {
+ Assert(result < tolen);
+ /* Append trailing null wchar (MultiByteToWideChar() does not) */
+ to[result] = 0;
+ }
+ }
+ else
+#endif /* WIN32 */
+ {
+ /* mbstowcs requires ending '\0' */
+ char *str = pnstrdup(from, fromlen);
+
+ if (locale == (pg_locale_t) 0)
+ {
+ /* Use mbstowcs directly for the default locale */
+ result = mbstowcs(to, str, tolen);
+ }
+ else
+ {
+#ifdef HAVE_LOCALE_T
+#ifdef HAVE_WCSTOMBS_L
+ /* Use mbstowcs_l for nondefault locales */
+ result = mbstowcs_l(to, str, tolen, locale);
+#else /* !HAVE_WCSTOMBS_L */
+ /* We have to temporarily set the locale as current ... ugh */
+ locale_t save_locale = uselocale(locale);
+
+ result = mbstowcs(to, str, tolen);
+
+ uselocale(save_locale);
+#endif /* HAVE_WCSTOMBS_L */
+#else /* !HAVE_LOCALE_T */
+ /* Can't have locale != 0 without HAVE_LOCALE_T */
+ elog(ERROR, "mbstowcs_l is not available");
+ result = 0; /* keep compiler quiet */
+#endif /* HAVE_LOCALE_T */
+ }
+
+ pfree(str);
+ }
+
+ if (result == -1)
+ {
+ /*
+ * Invalid multibyte character encountered. We try to give a useful
+ * error message by letting pg_verifymbstr check the string. But it's
+ * possible that the string is OK to us, and not OK to mbstowcs ---
+ * this suggests that the LC_CTYPE locale is different from the
+ * database encoding. Give a generic error message if verifymbstr
+ * can't find anything wrong.
+ */
+ pg_verifymbstr(from, fromlen, false); /* might not return */
+ /* but if it does ... */
+ ereport(ERROR,
+ (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+ errmsg("invalid multibyte character for locale"),
+ errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
+ }
+
+ return result;
+}
+
+#endif /* USE_WIDE_UPPER_LOWER */