diff options
author | Tom Lane <tgl@sss.pgh.pa.us> | 2011-04-23 12:35:41 -0400 |
---|---|---|
committer | Tom Lane <tgl@sss.pgh.pa.us> | 2011-04-23 12:35:41 -0400 |
commit | 2ab0796d7a3a7116a79b65531fd33f1548514b52 (patch) | |
tree | b327fa2ba27bef4dbd2dd287e23b5bf6ba8ac2ee /src/backend/utils/adt/pg_locale.c | |
parent | bb850306307d3d6ebb611c4039ae127236eb1699 (diff) | |
download | postgresql-2ab0796d7a3a7116a79b65531fd33f1548514b52.tar.gz postgresql-2ab0796d7a3a7116a79b65531fd33f1548514b52.zip |
Fix char2wchar/wchar2char to support collations properly.
These functions should take a pg_locale_t, not a collation OID, and should
call mbstowcs_l/wcstombs_l where available. Where those functions are not
available, temporarily select the correct locale with uselocale().
This change removes the bogus assumption that all locales selectable in
a given database have the same wide-character conversion method; in
particular, the collate.linux.utf8 regression test now passes with
LC_CTYPE=C, so long as the database encoding is UTF8.
I decided to move the char2wchar/wchar2char functions out of mbutils.c and
into pg_locale.c, because they work on wchar_t not pg_wchar_t and thus
don't really belong with the mbutils.c functions. Keeping them where they
were would have required importing pg_locale_t into pg_wchar.h somehow,
which did not seem like a good plan.
Diffstat (limited to 'src/backend/utils/adt/pg_locale.c')
-rw-r--r-- | src/backend/utils/adt/pg_locale.c | 173 |
1 files changed, 173 insertions, 0 deletions
diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index 0e6723d4690..8208d3cad9e 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -1030,3 +1030,176 @@ pg_newlocale_from_collation(Oid collid)
 	return cache_entry->locale;
 }
+
+
+/*
+ * These functions convert from/to libc's wchar_t, *not* pg_wchar_t.
+ * Therefore we keep them here rather than with the mbutils code.
+ */
+
+#ifdef USE_WIDE_UPPER_LOWER
+
+/*
+ * wchar2char --- convert wide characters to multibyte format
+ *
+ * This has the same API as the standard wcstombs_l() function; in particular,
+ * tolen is the maximum number of bytes to store at *to, and *from must be
+ * zero-terminated.  The output will be zero-terminated iff there is room.
+ *
+ * Returns 0 immediately when tolen is 0, without touching *to.  A failed
+ * conversion yields (size_t) -1 (set explicitly in the WIN32 branch; wcstombs
+ * reports failure the same way), and that value is handed back to the caller
+ * as-is: unlike char2wchar, no error is raised here for bad input.
+ *
+ * A locale of 0 selects the process's current locale.  A nonzero locale
+ * needs HAVE_LOCALE_T; without it the function fails with elog(ERROR).
+ */
+size_t
+wchar2char(char *to, const wchar_t *from, size_t tolen, pg_locale_t locale)
+{
+	size_t		result;
+
+	if (tolen == 0)
+		return 0;
+
+#ifdef WIN32
+
+	/*
+	 * On Windows, the "Unicode" locales assume UTF16 not UTF8 encoding, and
+	 * for some reason mbstowcs and wcstombs won't do this for us, so we use
+	 * MultiByteToWideChar().
+	 *
+	 * This branch is taken only for a UTF8 database, and it ignores the
+	 * locale argument.  The -1 source length tells the API that *from is
+	 * zero-terminated.
+	 */
+	if (GetDatabaseEncoding() == PG_UTF8)
+	{
+		result = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, tolen,
+									 NULL, NULL);
+		/*
+		 * A zero return is failure.  (result is an unsigned size_t, so the
+		 * "<= 0" test below can only be true for exactly zero.)  The -1
+		 * stored on failure becomes (size_t) -1.
+		 */
+		if (result <= 0)
+			result = -1;
+		else
+		{
+			Assert(result <= tolen);
+			/* Microsoft counts the zero terminator in the result */
+			result--;
+		}
+	}
+	else
+#endif   /* WIN32 -- note the "else" above binds to the "if" just below */
+	if (locale == (pg_locale_t) 0)
+	{
+		/* Use wcstombs directly for the default locale */
+		result = wcstombs(to, from, tolen);
+	}
+	else
+	{
+#ifdef HAVE_LOCALE_T
+#ifdef HAVE_WCSTOMBS_L
+		/* Use wcstombs_l for nondefault locales */
+		result = wcstombs_l(to, from, tolen, locale);
+#else							/* !HAVE_WCSTOMBS_L */
+		/*
+		 * We have to temporarily set the locale as current ... ugh
+		 *
+		 * The value returned by uselocale() is saved and passed back to
+		 * uselocale() right after the conversion, restoring the prior state.
+		 */
+		locale_t	save_locale = uselocale(locale);
+
+		result = wcstombs(to, from, tolen);
+
+		uselocale(save_locale);
+#endif   /* HAVE_WCSTOMBS_L */
+#else							/* !HAVE_LOCALE_T */
+		/* Can't have locale != 0 without HAVE_LOCALE_T */
+		elog(ERROR, "wcstombs_l is not available");
+		result = 0;				/* keep compiler quiet */
+#endif   /* HAVE_LOCALE_T */
+	}
+
+	return result;
+}
+
+/*
+ * char2wchar --- convert multibyte characters to wide characters
+ *
+ * This has almost the API of mbstowcs_l(), except that *from need not be
+ * null-terminated; instead, the number of input bytes is specified as
+ * fromlen.  Also, we ereport() rather than returning -1 for invalid
+ * input encoding.  tolen is the maximum number of wchar_t's to store at *to.
+ * The output will be zero-terminated iff there is room.
+ *
+ * Returns 0 immediately when tolen is 0, without touching *to.  Otherwise
+ * the value returned is whatever the underlying conversion call reported,
+ * except that a failure indication never reaches the caller: it is turned
+ * into an error at the bottom of the function.
+ *
+ * A locale of 0 selects the process's current locale.  A nonzero locale
+ * needs HAVE_LOCALE_T; without it the function fails with elog(ERROR).
+ */
+size_t
+char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen,
+		   pg_locale_t locale)
+{
+	size_t		result;
+
+	if (tolen == 0)
+		return 0;
+
+#ifdef WIN32
+	/* See WIN32 "Unicode" comment above; locale is ignored in this branch */
+	if (GetDatabaseEncoding() == PG_UTF8)
+	{
+		/* Win32 API does not work for zero-length input */
+		if (fromlen == 0)
+			result = 0;
+		else
+		{
+			/*
+			 * Only tolen - 1 output slots are offered to the API, keeping
+			 * one in reserve for the terminator stored further down.
+			 */
+			result = MultiByteToWideChar(CP_UTF8, 0, from, fromlen, to, tolen - 1);
+			/* A zero return is failure; -1 here becomes (size_t) -1 */
+			if (result == 0)
+				result = -1;
+		}
+
+		if (result != -1)
+		{
+			Assert(result < tolen);
+			/* Append trailing null wchar (MultiByteToWideChar() does not) */
+			to[result] = 0;
+		}
+	}
+	else
+#endif   /* WIN32 */
+	{
+		/*
+		 * mbstowcs requires ending '\0', so the conversion works on a
+		 * terminated copy of the first fromlen bytes; the copy is pfree'd
+		 * once the conversion call has returned.
+		 */
+		char	   *str = pnstrdup(from, fromlen);
+
+		if (locale == (pg_locale_t) 0)
+		{
+			/* Use mbstowcs directly for the default locale */
+			result = mbstowcs(to, str, tolen);
+		}
+		else
+		{
+#ifdef HAVE_LOCALE_T
+#ifdef HAVE_WCSTOMBS_L
+			/*
+			 * Use mbstowcs_l for nondefault locales
+			 *
+			 * NOTE(review): the guard tested here is HAVE_WCSTOMBS_L although
+			 * the function called is mbstowcs_l.  Presumably the two are
+			 * assumed to be available together -- confirm against the
+			 * configure checks.
+			 */
+			result = mbstowcs_l(to, str, tolen, locale);
+#else							/* !HAVE_WCSTOMBS_L */
+			/*
+			 * We have to temporarily set the locale as current ... ugh
+			 *
+			 * As in wchar2char, the value returned by uselocale() is saved
+			 * and passed back right after the conversion.
+			 */
+			locale_t	save_locale = uselocale(locale);
+
+			result = mbstowcs(to, str, tolen);
+
+			uselocale(save_locale);
+#endif   /* HAVE_WCSTOMBS_L */
+#else							/* !HAVE_LOCALE_T */
+			/* Can't have locale != 0 without HAVE_LOCALE_T */
+			elog(ERROR, "mbstowcs_l is not available");
+			result = 0;			/* keep compiler quiet */
+#endif   /* HAVE_LOCALE_T */
+		}
+
+		pfree(str);
+	}
+
+	if (result == -1)
+	{
+		/*
+		 * Invalid multibyte character encountered.  We try to give a useful
+		 * error message by letting pg_verifymbstr check the string.  But it's
+		 * possible that the string is OK to us, and not OK to mbstowcs ---
+		 * this suggests that the LC_CTYPE locale is different from the
+		 * database encoding.  Give a generic error message if verifymbstr
+		 * can't find anything wrong.
+		 *
+		 * The original from/fromlen input is what gets verified.  The
+		 * comparison against -1 above matches (size_t) -1, since result is
+		 * unsigned.
+		 */
+		pg_verifymbstr(from, fromlen, false);	/* might not return */
+		/* but if it does ... */
+		ereport(ERROR,
+				(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+				 errmsg("invalid multibyte character for locale"),
+				 errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
+	}
+
+	return result;
+}
+
+#endif   /* USE_WIDE_UPPER_LOWER */ |