diff options
author | Jeff Davis <jdavis@postgresql.org> | 2024-12-16 09:35:18 -0800 |
---|---|---|
committer | Jeff Davis <jdavis@postgresql.org> | 2024-12-16 09:35:18 -0800 |
commit | 86a5d6006aff956a5e00982b7628177fa7dc5027 (patch) | |
tree | 90edf4aeac487e9dba957a5fbb75f3d6ae9e58f6 | |
parent | de1e29885730851787b467449f525ff6fc7d69fa (diff) | |
download | postgresql-86a5d6006aff956a5e00982b7628177fa7dc5027.tar.gz postgresql-86a5d6006aff956a5e00982b7628177fa7dc5027.zip |
Refactor string case conversion into provider-specific files.
Create API entry points pg_strlower(), etc., that work with any
provider and give the caller control over the destination
buffer. Then, move provider-specific logic into pg_locale_builtin.c,
pg_locale_icu.c, and pg_locale_libc.c as appropriate.
Discussion: https://postgr.es/m/7aa46d77b377428058403723440862d12a8a129a.camel@j-davis.com
-rw-r--r-- | src/backend/utils/adt/formatting.c | 465 |
-rw-r--r-- | src/backend/utils/adt/pg_locale.c | 78 |
-rw-r--r-- | src/backend/utils/adt/pg_locale_builtin.c | 80 |
-rw-r--r-- | src/backend/utils/adt/pg_locale_icu.c | 130 |
-rw-r--r-- | src/backend/utils/adt/pg_locale_libc.c | 327 |
-rw-r--r-- | src/include/utils/pg_locale.h | 14 |
6 files changed, 676 insertions, 418 deletions
diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c index 0dcb5515119..30c06c8d099 100644 --- a/src/backend/utils/adt/formatting.c +++ b/src/backend/utils/adt/formatting.c @@ -1571,52 +1571,6 @@ str_numth(char *dest, char *num, int type) * upper/lower/initcap functions *****************************************************************************/ -#ifdef USE_ICU - -typedef int32_t (*ICU_Convert_Func) (UChar *dest, int32_t destCapacity, - const UChar *src, int32_t srcLength, - const char *locale, - UErrorCode *pErrorCode); - -static int32_t -icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale, - UChar **buff_dest, UChar *buff_source, int32_t len_source) -{ - UErrorCode status; - int32_t len_dest; - - len_dest = len_source; /* try first with same length */ - *buff_dest = palloc(len_dest * sizeof(**buff_dest)); - status = U_ZERO_ERROR; - len_dest = func(*buff_dest, len_dest, buff_source, len_source, - mylocale->info.icu.locale, &status); - if (status == U_BUFFER_OVERFLOW_ERROR) - { - /* try again with adjusted length */ - pfree(*buff_dest); - *buff_dest = palloc(len_dest * sizeof(**buff_dest)); - status = U_ZERO_ERROR; - len_dest = func(*buff_dest, len_dest, buff_source, len_source, - mylocale->info.icu.locale, &status); - } - if (U_FAILURE(status)) - ereport(ERROR, - (errmsg("case conversion failed: %s", u_errorName(status)))); - return len_dest; -} - -static int32_t -u_strToTitle_default_BI(UChar *dest, int32_t destCapacity, - const UChar *src, int32_t srcLength, - const char *locale, - UErrorCode *pErrorCode) -{ - return u_strToTitle(dest, destCapacity, src, srcLength, - NULL, locale, pErrorCode); -} - -#endif /* USE_ICU */ - /* * If the system provides the needed functions for wide-character manipulation * (which are all standardized by C99), then we implement upper/lower/initcap @@ -1664,106 +1618,28 @@ str_tolower(const char *buff, size_t nbytes, Oid collid) } else { -#ifdef USE_ICU - if (mylocale->provider == 
COLLPROVIDER_ICU) - { - int32_t len_uchar; - int32_t len_conv; - UChar *buff_uchar; - UChar *buff_conv; - - len_uchar = icu_to_uchar(&buff_uchar, buff, nbytes); - len_conv = icu_convert_case(u_strToLower, mylocale, - &buff_conv, buff_uchar, len_uchar); - icu_from_uchar(&result, buff_conv, len_conv); - pfree(buff_uchar); - pfree(buff_conv); - } - else -#endif - if (mylocale->provider == COLLPROVIDER_BUILTIN) + const char *src = buff; + size_t srclen = nbytes; + size_t dstsize; + char *dst; + size_t needed; + + /* first try buffer of equal size plus terminating NUL */ + dstsize = srclen + 1; + dst = palloc(dstsize); + + needed = pg_strlower(dst, dstsize, src, srclen, mylocale); + if (needed + 1 > dstsize) { - const char *src = buff; - size_t srclen = nbytes; - size_t dstsize; - char *dst; - size_t needed; - - Assert(GetDatabaseEncoding() == PG_UTF8); - - /* first try buffer of equal size plus terminating NUL */ - dstsize = srclen + 1; - dst = palloc(dstsize); - - needed = unicode_strlower(dst, dstsize, src, srclen); - if (needed + 1 > dstsize) - { - /* grow buffer if needed and retry */ - dstsize = needed + 1; - dst = repalloc(dst, dstsize); - needed = unicode_strlower(dst, dstsize, src, srclen); - Assert(needed + 1 == dstsize); - } - - Assert(dst[needed] == '\0'); - result = dst; + /* grow buffer if needed and retry */ + dstsize = needed + 1; + dst = repalloc(dst, dstsize); + needed = pg_strlower(dst, dstsize, src, srclen, mylocale); + Assert(needed + 1 <= dstsize); } - else - { - Assert(mylocale->provider == COLLPROVIDER_LIBC); - - if (pg_database_encoding_max_length() > 1) - { - wchar_t *workspace; - size_t curr_char; - size_t result_size; - - /* Overflow paranoia */ - if ((nbytes + 1) > (INT_MAX / sizeof(wchar_t))) - ereport(ERROR, - (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("out of memory"))); - - /* Output workspace cannot have more codes than input bytes */ - workspace = (wchar_t *) palloc((nbytes + 1) * sizeof(wchar_t)); - - char2wchar(workspace, nbytes + 1, 
buff, nbytes, mylocale); - - for (curr_char = 0; workspace[curr_char] != 0; curr_char++) - workspace[curr_char] = towlower_l(workspace[curr_char], mylocale->info.lt); - - /* - * Make result large enough; case change might change number - * of bytes - */ - result_size = curr_char * pg_database_encoding_max_length() + 1; - result = palloc(result_size); - wchar2char(result, workspace, result_size, mylocale); - pfree(workspace); - } - else - { - char *p; - - result = pnstrdup(buff, nbytes); - - /* - * Note: we assume that tolower_l() will not be so broken as - * to need an isupper_l() guard test. When using the default - * collation, we apply the traditional Postgres behavior that - * forces ASCII-style treatment of I/i, but in non-default - * collations you get exactly what the collation says. - */ - for (p = result; *p; p++) - { - if (mylocale->is_default) - *p = pg_tolower((unsigned char) *p); - else - *p = tolower_l((unsigned char) *p, mylocale->info.lt); - } - } - } + Assert(dst[needed] == '\0'); + result = dst; } return result; @@ -1806,152 +1682,33 @@ str_toupper(const char *buff, size_t nbytes, Oid collid) } else { -#ifdef USE_ICU - if (mylocale->provider == COLLPROVIDER_ICU) + const char *src = buff; + size_t srclen = nbytes; + size_t dstsize; + char *dst; + size_t needed; + + /* first try buffer of equal size plus terminating NUL */ + dstsize = srclen + 1; + dst = palloc(dstsize); + + needed = pg_strupper(dst, dstsize, src, srclen, mylocale); + if (needed + 1 > dstsize) { - int32_t len_uchar, - len_conv; - UChar *buff_uchar; - UChar *buff_conv; - - len_uchar = icu_to_uchar(&buff_uchar, buff, nbytes); - len_conv = icu_convert_case(u_strToUpper, mylocale, - &buff_conv, buff_uchar, len_uchar); - icu_from_uchar(&result, buff_conv, len_conv); - pfree(buff_uchar); - pfree(buff_conv); + /* grow buffer if needed and retry */ + dstsize = needed + 1; + dst = repalloc(dst, dstsize); + needed = pg_strupper(dst, dstsize, src, srclen, mylocale); + Assert(needed + 1 <= 
dstsize); } - else -#endif - if (mylocale->provider == COLLPROVIDER_BUILTIN) - { - const char *src = buff; - size_t srclen = nbytes; - size_t dstsize; - char *dst; - size_t needed; - - Assert(GetDatabaseEncoding() == PG_UTF8); - - /* first try buffer of equal size plus terminating NUL */ - dstsize = srclen + 1; - dst = palloc(dstsize); - - needed = unicode_strupper(dst, dstsize, src, srclen); - if (needed + 1 > dstsize) - { - /* grow buffer if needed and retry */ - dstsize = needed + 1; - dst = repalloc(dst, dstsize); - needed = unicode_strupper(dst, dstsize, src, srclen); - Assert(needed + 1 == dstsize); - } - - Assert(dst[needed] == '\0'); - result = dst; - } - else - { - Assert(mylocale->provider == COLLPROVIDER_LIBC); - - if (pg_database_encoding_max_length() > 1) - { - wchar_t *workspace; - size_t curr_char; - size_t result_size; - - /* Overflow paranoia */ - if ((nbytes + 1) > (INT_MAX / sizeof(wchar_t))) - ereport(ERROR, - (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("out of memory"))); - - /* Output workspace cannot have more codes than input bytes */ - workspace = (wchar_t *) palloc((nbytes + 1) * sizeof(wchar_t)); - - char2wchar(workspace, nbytes + 1, buff, nbytes, mylocale); - - for (curr_char = 0; workspace[curr_char] != 0; curr_char++) - workspace[curr_char] = towupper_l(workspace[curr_char], mylocale->info.lt); - - /* - * Make result large enough; case change might change number - * of bytes - */ - result_size = curr_char * pg_database_encoding_max_length() + 1; - result = palloc(result_size); - wchar2char(result, workspace, result_size, mylocale); - pfree(workspace); - } - else - { - char *p; - - result = pnstrdup(buff, nbytes); - - /* - * Note: we assume that toupper_l() will not be so broken as - * to need an islower_l() guard test. When using the default - * collation, we apply the traditional Postgres behavior that - * forces ASCII-style treatment of I/i, but in non-default - * collations you get exactly what the collation says. 
- */ - for (p = result; *p; p++) - { - if (mylocale->is_default) - *p = pg_toupper((unsigned char) *p); - else - *p = toupper_l((unsigned char) *p, mylocale->info.lt); - } - } - } + Assert(dst[needed] == '\0'); + result = dst; } return result; } -struct WordBoundaryState -{ - const char *str; - size_t len; - size_t offset; - bool init; - bool prev_alnum; -}; - -/* - * Simple word boundary iterator that draws boundaries each time the result of - * pg_u_isalnum() changes. - */ -static size_t -initcap_wbnext(void *state) -{ - struct WordBoundaryState *wbstate = (struct WordBoundaryState *) state; - - while (wbstate->offset < wbstate->len && - wbstate->str[wbstate->offset] != '\0') - { - pg_wchar u = utf8_to_unicode((unsigned char *) wbstate->str + - wbstate->offset); - bool curr_alnum = pg_u_isalnum(u, true); - - if (!wbstate->init || curr_alnum != wbstate->prev_alnum) - { - size_t prev_offset = wbstate->offset; - - wbstate->init = true; - wbstate->offset += unicode_utf8len(u); - wbstate->prev_alnum = curr_alnum; - return prev_offset; - } - - wbstate->offset += unicode_utf8len(u); - } - - return wbstate->len; -} - /* * collation-aware, wide-character-aware initcap function * @@ -1962,7 +1719,6 @@ char * str_initcap(const char *buff, size_t nbytes, Oid collid) { char *result; - int wasalnum = false; pg_locale_t mylocale; if (!buff) @@ -1990,135 +1746,28 @@ str_initcap(const char *buff, size_t nbytes, Oid collid) } else { -#ifdef USE_ICU - if (mylocale->provider == COLLPROVIDER_ICU) + const char *src = buff; + size_t srclen = nbytes; + size_t dstsize; + char *dst; + size_t needed; + + /* first try buffer of equal size plus terminating NUL */ + dstsize = srclen + 1; + dst = palloc(dstsize); + + needed = pg_strtitle(dst, dstsize, src, srclen, mylocale); + if (needed + 1 > dstsize) { - int32_t len_uchar, - len_conv; - UChar *buff_uchar; - UChar *buff_conv; - - len_uchar = icu_to_uchar(&buff_uchar, buff, nbytes); - len_conv = icu_convert_case(u_strToTitle_default_BI, 
mylocale, - &buff_conv, buff_uchar, len_uchar); - icu_from_uchar(&result, buff_conv, len_conv); - pfree(buff_uchar); - pfree(buff_conv); + /* grow buffer if needed and retry */ + dstsize = needed + 1; + dst = repalloc(dst, dstsize); + needed = pg_strtitle(dst, dstsize, src, srclen, mylocale); + Assert(needed + 1 <= dstsize); } - else -#endif - if (mylocale->provider == COLLPROVIDER_BUILTIN) - { - const char *src = buff; - size_t srclen = nbytes; - size_t dstsize; - char *dst; - size_t needed; - struct WordBoundaryState wbstate = { - .str = src, - .len = srclen, - .offset = 0, - .init = false, - .prev_alnum = false, - }; - - Assert(GetDatabaseEncoding() == PG_UTF8); - - /* first try buffer of equal size plus terminating NUL */ - dstsize = srclen + 1; - dst = palloc(dstsize); - - needed = unicode_strtitle(dst, dstsize, src, srclen, - initcap_wbnext, &wbstate); - if (needed + 1 > dstsize) - { - /* reset iterator */ - wbstate.offset = 0; - wbstate.init = false; - - /* grow buffer if needed and retry */ - dstsize = needed + 1; - dst = repalloc(dst, dstsize); - needed = unicode_strtitle(dst, dstsize, src, srclen, - initcap_wbnext, &wbstate); - Assert(needed + 1 == dstsize); - } - result = dst; - } - else - { - Assert(mylocale->provider == COLLPROVIDER_LIBC); - - if (pg_database_encoding_max_length() > 1) - { - wchar_t *workspace; - size_t curr_char; - size_t result_size; - - /* Overflow paranoia */ - if ((nbytes + 1) > (INT_MAX / sizeof(wchar_t))) - ereport(ERROR, - (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("out of memory"))); - - /* Output workspace cannot have more codes than input bytes */ - workspace = (wchar_t *) palloc((nbytes + 1) * sizeof(wchar_t)); - - char2wchar(workspace, nbytes + 1, buff, nbytes, mylocale); - - for (curr_char = 0; workspace[curr_char] != 0; curr_char++) - { - if (wasalnum) - workspace[curr_char] = towlower_l(workspace[curr_char], mylocale->info.lt); - else - workspace[curr_char] = towupper_l(workspace[curr_char], mylocale->info.lt); - 
wasalnum = iswalnum_l(workspace[curr_char], mylocale->info.lt); - } - - /* - * Make result large enough; case change might change number - * of bytes - */ - result_size = curr_char * pg_database_encoding_max_length() + 1; - result = palloc(result_size); - - wchar2char(result, workspace, result_size, mylocale); - pfree(workspace); - } - else - { - char *p; - - result = pnstrdup(buff, nbytes); - - /* - * Note: we assume that toupper_l()/tolower_l() will not be so - * broken as to need guard tests. When using the default - * collation, we apply the traditional Postgres behavior that - * forces ASCII-style treatment of I/i, but in non-default - * collations you get exactly what the collation says. - */ - for (p = result; *p; p++) - { - if (mylocale->is_default) - { - if (wasalnum) - *p = pg_tolower((unsigned char) *p); - else - *p = pg_toupper((unsigned char) *p); - } - else - { - if (wasalnum) - *p = tolower_l((unsigned char) *p, mylocale->info.lt); - else - *p = toupper_l((unsigned char) *p, mylocale->info.lt); - } - wasalnum = isalnum_l((unsigned char) *p, mylocale->info.lt); - } - } - } + Assert(dst[needed] == '\0'); + result = dst; } return result; diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c index 4cb56126e97..d16f26f1705 100644 --- a/src/backend/utils/adt/pg_locale.c +++ b/src/backend/utils/adt/pg_locale.c @@ -116,6 +116,27 @@ extern size_t strnxfrm_libc(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale); +extern size_t strlower_builtin(char *dst, size_t dstsize, const char *src, + ssize_t srclen, pg_locale_t locale); +extern size_t strtitle_builtin(char *dst, size_t dstsize, const char *src, + ssize_t srclen, pg_locale_t locale); +extern size_t strupper_builtin(char *dst, size_t dstsize, const char *src, + ssize_t srclen, pg_locale_t locale); + +extern size_t strlower_icu(char *dst, size_t dstsize, const char *src, + ssize_t srclen, pg_locale_t locale); +extern size_t strtitle_icu(char *dst, 
size_t dstsize, const char *src, + ssize_t srclen, pg_locale_t locale); +extern size_t strupper_icu(char *dst, size_t dstsize, const char *src, + ssize_t srclen, pg_locale_t locale); + +extern size_t strlower_libc(char *dst, size_t dstsize, const char *src, + ssize_t srclen, pg_locale_t locale); +extern size_t strtitle_libc(char *dst, size_t dstsize, const char *src, + ssize_t srclen, pg_locale_t locale); +extern size_t strupper_libc(char *dst, size_t dstsize, const char *src, + ssize_t srclen, pg_locale_t locale); + /* GUC settings */ char *locale_messages; char *locale_monetary; @@ -1468,6 +1489,63 @@ get_collation_actual_version(char collprovider, const char *collcollate) return collversion; } +size_t +pg_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen, + pg_locale_t locale) +{ + if (locale->provider == COLLPROVIDER_BUILTIN) + return strlower_builtin(dst, dstsize, src, srclen, locale); +#ifdef USE_ICU + else if (locale->provider == COLLPROVIDER_ICU) + return strlower_icu(dst, dstsize, src, srclen, locale); +#endif + else if (locale->provider == COLLPROVIDER_LIBC) + return strlower_libc(dst, dstsize, src, srclen, locale); + else + /* shouldn't happen */ + PGLOCALE_SUPPORT_ERROR(locale->provider); + + return 0; /* keep compiler quiet */ +} + +size_t +pg_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen, + pg_locale_t locale) +{ + if (locale->provider == COLLPROVIDER_BUILTIN) + return strtitle_builtin(dst, dstsize, src, srclen, locale); +#ifdef USE_ICU + else if (locale->provider == COLLPROVIDER_ICU) + return strtitle_icu(dst, dstsize, src, srclen, locale); +#endif + else if (locale->provider == COLLPROVIDER_LIBC) + return strtitle_libc(dst, dstsize, src, srclen, locale); + else + /* shouldn't happen */ + PGLOCALE_SUPPORT_ERROR(locale->provider); + + return 0; /* keep compiler quiet */ +} + +size_t +pg_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen, + pg_locale_t locale) +{ + if (locale->provider == 
COLLPROVIDER_BUILTIN) + return strupper_builtin(dst, dstsize, src, srclen, locale); +#ifdef USE_ICU + else if (locale->provider == COLLPROVIDER_ICU) + return strupper_icu(dst, dstsize, src, srclen, locale); +#endif + else if (locale->provider == COLLPROVIDER_LIBC) + return strupper_libc(dst, dstsize, src, srclen, locale); + else + /* shouldn't happen */ + PGLOCALE_SUPPORT_ERROR(locale->provider); + + return 0; /* keep compiler quiet */ +} + /* * pg_strcoll * diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c index 4246971a4d8..d3aa7bceacd 100644 --- a/src/backend/utils/adt/pg_locale_builtin.c +++ b/src/backend/utils/adt/pg_locale_builtin.c @@ -13,6 +13,8 @@ #include "catalog/pg_database.h" #include "catalog/pg_collation.h" +#include "common/unicode_case.h" +#include "common/unicode_category.h" #include "mb/pg_wchar.h" #include "miscadmin.h" #include "utils/builtins.h" @@ -22,6 +24,84 @@ extern pg_locale_t create_pg_locale_builtin(Oid collid, MemoryContext context); +extern size_t strlower_builtin(char *dst, size_t dstsize, const char *src, + ssize_t srclen, pg_locale_t locale); +extern size_t strtitle_builtin(char *dst, size_t dstsize, const char *src, + ssize_t srclen, pg_locale_t locale); +extern size_t strupper_builtin(char *dst, size_t dstsize, const char *src, + ssize_t srclen, pg_locale_t locale); + + +struct WordBoundaryState +{ + const char *str; + size_t len; + size_t offset; + bool init; + bool prev_alnum; +}; + +/* + * Simple word boundary iterator that draws boundaries each time the result of + * pg_u_isalnum() changes. 
+ */ +static size_t +initcap_wbnext(void *state) +{ + struct WordBoundaryState *wbstate = (struct WordBoundaryState *) state; + + while (wbstate->offset < wbstate->len && + wbstate->str[wbstate->offset] != '\0') + { + pg_wchar u = utf8_to_unicode((unsigned char *) wbstate->str + + wbstate->offset); + bool curr_alnum = pg_u_isalnum(u, true); + + if (!wbstate->init || curr_alnum != wbstate->prev_alnum) + { + size_t prev_offset = wbstate->offset; + + wbstate->init = true; + wbstate->offset += unicode_utf8len(u); + wbstate->prev_alnum = curr_alnum; + return prev_offset; + } + + wbstate->offset += unicode_utf8len(u); + } + + return wbstate->len; +} + +size_t +strlower_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, + pg_locale_t locale) +{ + return unicode_strlower(dest, destsize, src, srclen); +} + +size_t +strtitle_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, + pg_locale_t locale) +{ + struct WordBoundaryState wbstate = { + .str = src, + .len = srclen, + .offset = 0, + .init = false, + .prev_alnum = false, + }; + + return unicode_strtitle(dest, destsize, src, srclen, + initcap_wbnext, &wbstate); +} + +size_t +strupper_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, + pg_locale_t locale) +{ + return unicode_strupper(dest, destsize, src, srclen); +} pg_locale_t create_pg_locale_builtin(Oid collid, MemoryContext context) diff --git a/src/backend/utils/adt/pg_locale_icu.c b/src/backend/utils/adt/pg_locale_icu.c index 2c6b950ec18..f0a77a767e7 100644 --- a/src/backend/utils/adt/pg_locale_icu.c +++ b/src/backend/utils/adt/pg_locale_icu.c @@ -48,6 +48,12 @@ #define TEXTBUFLEN 1024 extern pg_locale_t create_pg_locale_icu(Oid collid, MemoryContext context); +extern size_t strlower_icu(char *dst, size_t dstsize, const char *src, + ssize_t srclen, pg_locale_t locale); +extern size_t strtitle_icu(char *dst, size_t dstsize, const char *src, + ssize_t srclen, pg_locale_t locale); +extern size_t strupper_icu(char 
*dst, size_t dstsize, const char *src, + ssize_t srclen, pg_locale_t locale); #ifdef USE_ICU @@ -62,6 +68,11 @@ extern size_t strnxfrm_prefix_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale); +typedef int32_t (*ICU_Convert_Func) (UChar *dest, int32_t destCapacity, + const UChar *src, int32_t srcLength, + const char *locale, + UErrorCode *pErrorCode); + /* * Converter object for converting between ICU's UChar strings and C strings * in database encoding. Since the database encoding doesn't change, we only @@ -83,8 +94,19 @@ static size_t uchar_length(UConverter *converter, static int32_t uchar_convert(UConverter *converter, UChar *dest, int32_t destlen, const char *src, int32_t srclen); +static int32_t icu_to_uchar(UChar **buff_uchar, const char *buff, + size_t nbytes); +static size_t icu_from_uchar(char *dest, size_t destsize, + const UChar *buff_uchar, int32_t len_uchar); static void icu_set_collation_attributes(UCollator *collator, const char *loc, UErrorCode *status); +static int32_t icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale, + UChar **buff_dest, UChar *buff_source, + int32_t len_source); +static int32_t u_strToTitle_default_BI(UChar *dest, int32_t destCapacity, + const UChar *src, int32_t srcLength, + const char *locale, + UErrorCode *pErrorCode); #endif pg_locale_t @@ -324,6 +346,66 @@ make_icu_collator(const char *iculocstr, const char *icurules) } } +size_t +strlower_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, + pg_locale_t locale) +{ + int32_t len_uchar; + int32_t len_conv; + UChar *buff_uchar; + UChar *buff_conv; + size_t result_len; + + len_uchar = icu_to_uchar(&buff_uchar, src, srclen); + len_conv = icu_convert_case(u_strToLower, locale, + &buff_conv, buff_uchar, len_uchar); + result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv); + pfree(buff_uchar); + pfree(buff_conv); + + return result_len; +} + +size_t +strtitle_icu(char *dest, size_t destsize, const char *src, 
ssize_t srclen, + pg_locale_t locale) +{ + int32_t len_uchar; + int32_t len_conv; + UChar *buff_uchar; + UChar *buff_conv; + size_t result_len; + + len_uchar = icu_to_uchar(&buff_uchar, src, srclen); + len_conv = icu_convert_case(u_strToTitle_default_BI, locale, + &buff_conv, buff_uchar, len_uchar); + result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv); + pfree(buff_uchar); + pfree(buff_conv); + + return result_len; +} + +size_t +strupper_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, + pg_locale_t locale) +{ + int32_t len_uchar; + int32_t len_conv; + UChar *buff_uchar; + UChar *buff_conv; + size_t result_len; + + len_uchar = icu_to_uchar(&buff_uchar, src, srclen); + len_conv = icu_convert_case(u_strToUpper, locale, + &buff_conv, buff_uchar, len_uchar); + result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv); + pfree(buff_uchar); + pfree(buff_conv); + + return result_len; +} + /* * strncoll_icu * @@ -458,7 +540,7 @@ strnxfrm_prefix_icu(char *dest, size_t destsize, * The result string is nul-terminated, though most callers rely on the * result length instead. */ -int32_t +static int32_t icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes) { int32_t len_uchar; @@ -485,8 +567,8 @@ icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes) * * The result string is nul-terminated. 
*/ -int32_t -icu_from_uchar(char **result, const UChar *buff_uchar, int32_t len_uchar) +static size_t +icu_from_uchar(char *dest, size_t destsize, const UChar *buff_uchar, int32_t len_uchar) { UErrorCode status; int32_t len_result; @@ -501,10 +583,11 @@ icu_from_uchar(char **result, const UChar *buff_uchar, int32_t len_uchar) (errmsg("%s failed: %s", "ucnv_fromUChars", u_errorName(status)))); - *result = palloc(len_result + 1); + if (len_result + 1 > destsize) + return len_result; status = U_ZERO_ERROR; - len_result = ucnv_fromUChars(icu_converter, *result, len_result + 1, + len_result = ucnv_fromUChars(icu_converter, dest, len_result + 1, buff_uchar, len_uchar, &status); if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING) @@ -515,6 +598,43 @@ icu_from_uchar(char **result, const UChar *buff_uchar, int32_t len_uchar) return len_result; } +static int32_t +icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale, + UChar **buff_dest, UChar *buff_source, int32_t len_source) +{ + UErrorCode status; + int32_t len_dest; + + len_dest = len_source; /* try first with same length */ + *buff_dest = palloc(len_dest * sizeof(**buff_dest)); + status = U_ZERO_ERROR; + len_dest = func(*buff_dest, len_dest, buff_source, len_source, + mylocale->info.icu.locale, &status); + if (status == U_BUFFER_OVERFLOW_ERROR) + { + /* try again with adjusted length */ + pfree(*buff_dest); + *buff_dest = palloc(len_dest * sizeof(**buff_dest)); + status = U_ZERO_ERROR; + len_dest = func(*buff_dest, len_dest, buff_source, len_source, + mylocale->info.icu.locale, &status); + } + if (U_FAILURE(status)) + ereport(ERROR, + (errmsg("case conversion failed: %s", u_errorName(status)))); + return len_dest; +} + +static int32_t +u_strToTitle_default_BI(UChar *dest, int32_t destCapacity, + const UChar *src, int32_t srcLength, + const char *locale, + UErrorCode *pErrorCode) +{ + return u_strToTitle(dest, destCapacity, src, srcLength, + NULL, locale, pErrorCode); +} + /* * strncoll_icu_no_utf8 
* diff --git a/src/backend/utils/adt/pg_locale_libc.c b/src/backend/utils/adt/pg_locale_libc.c index 374ac37ba0a..97ca5a28e66 100644 --- a/src/backend/utils/adt/pg_locale_libc.c +++ b/src/backend/utils/adt/pg_locale_libc.c @@ -11,6 +11,9 @@ #include "postgres.h" +#include <limits.h> +#include <wctype.h> + #include "access/htup_details.h" #include "catalog/pg_database.h" #include "catalog/pg_collation.h" @@ -32,6 +35,13 @@ extern pg_locale_t create_pg_locale_libc(Oid collid, MemoryContext context); +extern size_t strlower_libc(char *dst, size_t dstsize, const char *src, + ssize_t srclen, pg_locale_t locale); +extern size_t strtitle_libc(char *dst, size_t dstsize, const char *src, + ssize_t srclen, pg_locale_t locale); +extern size_t strupper_libc(char *dst, size_t dstsize, const char *src, + ssize_t srclen, pg_locale_t locale); + extern int strncoll_libc(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, pg_locale_t locale); @@ -48,6 +58,323 @@ static int strncoll_libc_win32_utf8(const char *arg1, ssize_t len1, pg_locale_t locale); #endif +static size_t strlower_libc_sb(char *dest, size_t destsize, + const char *src, ssize_t srclen, + pg_locale_t locale); +static size_t strlower_libc_mb(char *dest, size_t destsize, + const char *src, ssize_t srclen, + pg_locale_t locale); +static size_t strtitle_libc_sb(char *dest, size_t destsize, + const char *src, ssize_t srclen, + pg_locale_t locale); +static size_t strtitle_libc_mb(char *dest, size_t destsize, + const char *src, ssize_t srclen, + pg_locale_t locale); +static size_t strupper_libc_sb(char *dest, size_t destsize, + const char *src, ssize_t srclen, + pg_locale_t locale); +static size_t strupper_libc_mb(char *dest, size_t destsize, + const char *src, ssize_t srclen, + pg_locale_t locale); + +size_t +strlower_libc(char *dst, size_t dstsize, const char *src, + ssize_t srclen, pg_locale_t locale) +{ + if (pg_database_encoding_max_length() > 1) + return strlower_libc_mb(dst, dstsize, src, srclen, locale); + 
else + return strlower_libc_sb(dst, dstsize, src, srclen, locale); +} + +size_t +strtitle_libc(char *dst, size_t dstsize, const char *src, + ssize_t srclen, pg_locale_t locale) +{ + if (pg_database_encoding_max_length() > 1) + return strtitle_libc_mb(dst, dstsize, src, srclen, locale); + else + return strtitle_libc_sb(dst, dstsize, src, srclen, locale); +} + +size_t +strupper_libc(char *dst, size_t dstsize, const char *src, + ssize_t srclen, pg_locale_t locale) +{ + if (pg_database_encoding_max_length() > 1) + return strupper_libc_mb(dst, dstsize, src, srclen, locale); + else + return strupper_libc_sb(dst, dstsize, src, srclen, locale); +} + +static size_t +strlower_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen, + pg_locale_t locale) +{ + if (srclen < 0) + srclen = strlen(src); + + if (srclen + 1 <= destsize) + { + locale_t loc = locale->info.lt; + char *p; + + if (srclen + 1 > destsize) + return srclen; + + memcpy(dest, src, srclen); + dest[srclen] = '\0'; + + /* + * Note: we assume that tolower_l() will not be so broken as to need + * an isupper_l() guard test. When using the default collation, we + * apply the traditional Postgres behavior that forces ASCII-style + * treatment of I/i, but in non-default collations you get exactly + * what the collation says. 
+		 */
+		for (p = dest; *p; p++)
+		{
+			if (locale->is_default)
+				*p = pg_tolower((unsigned char) *p);
+			else
+				*p = tolower_l((unsigned char) *p, loc);
+		}
+	}
+
+	return srclen;
+}
+
+/*
+ * strlower_libc_mb
+ *
+ * Lowercase src through the wide-character path: convert to wchar_t with
+ * char2wchar(), apply towlower_l() to every code, and convert back with
+ * wchar2char().
+ *
+ * A negative srclen means the length is taken with strlen(src).  The return
+ * value is the byte length of the converted string, not counting the
+ * terminator.  dest is written and NUL-terminated only when that length plus
+ * one fits in destsize; otherwise dest is left untouched and the caller can
+ * detect the shortfall from the return value.
+ */
+static size_t
+strlower_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen,
+				 pg_locale_t locale)
+{
+	locale_t	loc = locale->info.lt;
+	size_t		result_size;
+	wchar_t    *workspace;
+	char	   *result;
+	size_t		curr_char;
+	size_t		max_size;
+
+	if (srclen < 0)
+		srclen = strlen(src);
+
+	/* Overflow paranoia */
+	if ((srclen + 1) > (INT_MAX / sizeof(wchar_t)))
+		ereport(ERROR,
+				(errcode(ERRCODE_OUT_OF_MEMORY),
+				 errmsg("out of memory")));
+
+	/* Output workspace cannot have more codes than input bytes */
+	workspace = (wchar_t *) palloc((srclen + 1) * sizeof(wchar_t));
+
+	char2wchar(workspace, srclen + 1, src, srclen, locale);
+
+	for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
+		workspace[curr_char] = towlower_l(workspace[curr_char], loc);
+
+	/*
+	 * Make result large enough; case change might change number of bytes
+	 */
+	max_size = curr_char * pg_database_encoding_max_length();
+	result = palloc(max_size + 1);
+
+	result_size = wchar2char(result, workspace, max_size + 1, locale);
+
+	/*
+	 * Copy out only when the result and its terminator fit.  Either way,
+	 * fall through so the scratch buffers are released; returning early
+	 * here would leak them.
+	 */
+	if (result_size + 1 <= destsize)
+	{
+		memcpy(dest, result, result_size);
+		dest[result_size] = '\0';
+	}
+
+	pfree(workspace);
+	pfree(result);
+
+	return result_size;
+}
+
+/*
+ * strtitle_libc_sb
+ *
+ * Byte-at-a-time title-casing: copy src into dest, then uppercase each byte
+ * that follows a non-alphanumeric byte (or starts the string) and lowercase
+ * each byte that follows an alphanumeric one.
+ *
+ * A negative srclen means the length is taken with strlen(src).  Always
+ * returns srclen.  If srclen + 1 exceeds destsize, dest is left untouched.
+ */
+static size_t
+strtitle_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen,
+				 pg_locale_t locale)
+{
+	if (srclen < 0)
+		srclen = strlen(src);
+
+	if (srclen + 1 <= destsize)
+	{
+		locale_t	loc = locale->info.lt;
+		/* int rather than bool: holds the raw result of isalnum_l() */
+		int			wasalnum = false;
+		char	   *p;
+
+		memcpy(dest, src, srclen);
+		dest[srclen] = '\0';
+
+		/*
+		 * Note: we assume that toupper_l()/tolower_l() will not be so broken
+		 * as to need guard tests.  When using the default collation, we apply
+		 * the traditional Postgres behavior that forces ASCII-style treatment
+		 * of I/i, but in non-default collations you get exactly what the
+		 * collation says.
+		 */
+		for (p = dest; *p; p++)
+		{
+			if (locale->is_default)
+			{
+				if (wasalnum)
+					*p = pg_tolower((unsigned char) *p);
+				else
+					*p = pg_toupper((unsigned char) *p);
+			}
+			else
+			{
+				if (wasalnum)
+					*p = tolower_l((unsigned char) *p, loc);
+				else
+					*p = toupper_l((unsigned char) *p, loc);
+			}
+			wasalnum = isalnum_l((unsigned char) *p, loc);
+		}
+	}
+
+	return srclen;
+}
+
+/*
+ * strtitle_libc_mb
+ *
+ * Title-case src through the wide-character path: convert to wchar_t with
+ * char2wchar(), uppercase each code that follows a non-alphanumeric code (or
+ * starts the string), lowercase each code that follows an alphanumeric one,
+ * and convert back with wchar2char().
+ *
+ * A negative srclen means the length is taken with strlen(src).  The return
+ * value is the byte length of the converted string, not counting the
+ * terminator.  dest is written and NUL-terminated only when that length plus
+ * one fits in destsize; otherwise dest is left untouched and the caller can
+ * detect the shortfall from the return value.
+ */
+static size_t
+strtitle_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen,
+				 pg_locale_t locale)
+{
+	locale_t	loc = locale->info.lt;
+	/* int rather than bool: holds the raw result of iswalnum_l() */
+	int			wasalnum = false;
+	size_t		result_size;
+	wchar_t    *workspace;
+	char	   *result;
+	size_t		curr_char;
+	size_t		max_size;
+
+	if (srclen < 0)
+		srclen = strlen(src);
+
+	/* Overflow paranoia */
+	if ((srclen + 1) > (INT_MAX / sizeof(wchar_t)))
+		ereport(ERROR,
+				(errcode(ERRCODE_OUT_OF_MEMORY),
+				 errmsg("out of memory")));
+
+	/* Output workspace cannot have more codes than input bytes */
+	workspace = (wchar_t *) palloc((srclen + 1) * sizeof(wchar_t));
+
+	char2wchar(workspace, srclen + 1, src, srclen, locale);
+
+	for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
+	{
+		if (wasalnum)
+			workspace[curr_char] = towlower_l(workspace[curr_char], loc);
+		else
+			workspace[curr_char] = towupper_l(workspace[curr_char], loc);
+		wasalnum = iswalnum_l(workspace[curr_char], loc);
+	}
+
+	/*
+	 * Make result large enough; case change might change number of bytes
+	 */
+	max_size = curr_char * pg_database_encoding_max_length();
+	result = palloc(max_size + 1);
+
+	result_size = wchar2char(result, workspace, max_size + 1, locale);
+
+	/*
+	 * Copy out only when the result and its terminator fit.  Either way,
+	 * fall through so the scratch buffers are released; returning early
+	 * here would leak them.
+	 */
+	if (result_size + 1 <= destsize)
+	{
+		memcpy(dest, result, result_size);
+		dest[result_size] = '\0';
+	}
+
+	pfree(workspace);
+	pfree(result);
+
+	return result_size;
+}
+
+/*
+ * strupper_libc_sb
+ *
+ * Byte-at-a-time uppercasing: copy src into dest, then convert each byte in
+ * place.
+ *
+ * A negative srclen means the length is taken with strlen(src).  Always
+ * returns srclen.  If srclen + 1 exceeds destsize, dest is left untouched.
+ */
+static size_t
+strupper_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen,
+				 pg_locale_t locale)
+{
+	if (srclen < 0)
+		srclen = strlen(src);
+
+	if (srclen + 1 <= destsize)
+	{
+		locale_t	loc = locale->info.lt;
+		char	   *p;
+
+		memcpy(dest, src, srclen);
+		dest[srclen] = '\0';
+
+		/*
+		 * Note: we assume that toupper_l() will not be so broken as to need
+		 * an islower_l() guard test.  When using the default collation, we
+		 * apply the traditional Postgres behavior that forces ASCII-style
+		 * treatment of I/i, but in non-default collations you get exactly
+		 * what the collation says.
+		 */
+		for (p = dest; *p; p++)
+		{
+			if (locale->is_default)
+				*p = pg_toupper((unsigned char) *p);
+			else
+				*p = toupper_l((unsigned char) *p, loc);
+		}
+	}
+
+	return srclen;
+}
+
+/*
+ * strupper_libc_mb
+ *
+ * Uppercase src through the wide-character path: convert to wchar_t with
+ * char2wchar(), apply towupper_l() to every code, and convert back with
+ * wchar2char().
+ *
+ * A negative srclen means the length is taken with strlen(src).  The return
+ * value is the byte length of the converted string, not counting the
+ * terminator.  dest is written and NUL-terminated only when that length plus
+ * one fits in destsize; otherwise dest is left untouched and the caller can
+ * detect the shortfall from the return value.
+ */
+static size_t
+strupper_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen,
+				 pg_locale_t locale)
+{
+	locale_t	loc = locale->info.lt;
+	size_t		result_size;
+	wchar_t    *workspace;
+	char	   *result;
+	size_t		curr_char;
+	size_t		max_size;
+
+	if (srclen < 0)
+		srclen = strlen(src);
+
+	/* Overflow paranoia */
+	if ((srclen + 1) > (INT_MAX / sizeof(wchar_t)))
+		ereport(ERROR,
+				(errcode(ERRCODE_OUT_OF_MEMORY),
+				 errmsg("out of memory")));
+
+	/* Output workspace cannot have more codes than input bytes */
+	workspace = (wchar_t *) palloc((srclen + 1) * sizeof(wchar_t));
+
+	char2wchar(workspace, srclen + 1, src, srclen, locale);
+
+	for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
+		workspace[curr_char] = towupper_l(workspace[curr_char], loc);
+
+	/*
+	 * Make result large enough; case change might change number of bytes
+	 */
+	max_size = curr_char * pg_database_encoding_max_length();
+	result = palloc(max_size + 1);
+
+	result_size = wchar2char(result, workspace, max_size + 1, locale);
+
+	/*
+	 * Copy out only when the result and its terminator fit.  Either way,
+	 * fall through so the scratch buffers are released; returning early
+	 * here would leak them.
+	 */
+	if (result_size + 1 <= destsize)
+	{
+		memcpy(dest, result, result_size);
+		dest[result_size] = '\0';
+	}
+
+	pfree(workspace);
+	pfree(result);
+
+	return result_size;
+}
+
pg_locale_t create_pg_locale_libc(Oid collid, MemoryContext context) { diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h index 776f8f6f2fe..861df3ddd05 100644 --- a/src/include/utils/pg_locale.h +++ b/src/include/utils/pg_locale.h @@ -93,6 +93,15 @@ extern void init_database_collation(void); extern pg_locale_t pg_newlocale_from_collation(Oid collid); extern char *get_collation_actual_version(char collprovider, const char *collcollate); +extern size_t pg_strlower(char *dest, size_t destsize, + const char *src, ssize_t srclen, + pg_locale_t locale); +extern size_t pg_strtitle(char *dest, size_t destsize, + const char *src, ssize_t srclen, + pg_locale_t locale); +extern size_t pg_strupper(char *dest, size_t destsize, + const char *src, ssize_t srclen, + pg_locale_t locale); extern int pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale); extern int pg_strncoll(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, pg_locale_t locale); @@ -112,11 +121,6 @@ extern const char *builtin_validate_locale(int encoding, const char *locale); extern void icu_validate_locale(const char *loc_str); extern char *icu_language_tag(const char *loc_str, int elevel); -#ifdef USE_ICU -extern int32_t icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes); -extern int32_t icu_from_uchar(char **result, const UChar *buff_uchar, int32_t len_uchar); -#endif - /* These functions convert from/to libc's wchar_t, *not* pg_wchar_t */ extern size_t wchar2char(char *to, const wchar_t *from, size_t tolen, pg_locale_t locale); |