diff options
author | Jeff Davis <jdavis@postgresql.org> | 2024-12-16 09:35:18 -0800 |
---|---|---|
committer | Jeff Davis <jdavis@postgresql.org> | 2024-12-16 09:35:18 -0800 |
commit | 86a5d6006aff956a5e00982b7628177fa7dc5027 (patch) | |
tree | 90edf4aeac487e9dba957a5fbb75f3d6ae9e58f6 /src/backend/utils/adt/formatting.c | |
parent | de1e29885730851787b467449f525ff6fc7d69fa (diff) | |
download | postgresql-86a5d6006aff956a5e00982b7628177fa7dc5027.tar.gz postgresql-86a5d6006aff956a5e00982b7628177fa7dc5027.zip |
Refactor string case conversion into provider-specific files.
Create API entry points pg_strlower(), etc., that work with any
provider and give the caller control over the destination
buffer. Then, move provider-specific logic into pg_locale_builtin.c,
pg_locale_icu.c, and pg_locale_libc.c as appropriate.
Discussion: https://postgr.es/m/7aa46d77b377428058403723440862d12a8a129a.camel@j-davis.com
Diffstat (limited to 'src/backend/utils/adt/formatting.c')
-rw-r--r-- | src/backend/utils/adt/formatting.c | 465 |
1 file changed, 57 insertions, 408 deletions
diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c index 0dcb5515119..30c06c8d099 100644 --- a/src/backend/utils/adt/formatting.c +++ b/src/backend/utils/adt/formatting.c @@ -1571,52 +1571,6 @@ str_numth(char *dest, char *num, int type) * upper/lower/initcap functions *****************************************************************************/ -#ifdef USE_ICU - -typedef int32_t (*ICU_Convert_Func) (UChar *dest, int32_t destCapacity, - const UChar *src, int32_t srcLength, - const char *locale, - UErrorCode *pErrorCode); - -static int32_t -icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale, - UChar **buff_dest, UChar *buff_source, int32_t len_source) -{ - UErrorCode status; - int32_t len_dest; - - len_dest = len_source; /* try first with same length */ - *buff_dest = palloc(len_dest * sizeof(**buff_dest)); - status = U_ZERO_ERROR; - len_dest = func(*buff_dest, len_dest, buff_source, len_source, - mylocale->info.icu.locale, &status); - if (status == U_BUFFER_OVERFLOW_ERROR) - { - /* try again with adjusted length */ - pfree(*buff_dest); - *buff_dest = palloc(len_dest * sizeof(**buff_dest)); - status = U_ZERO_ERROR; - len_dest = func(*buff_dest, len_dest, buff_source, len_source, - mylocale->info.icu.locale, &status); - } - if (U_FAILURE(status)) - ereport(ERROR, - (errmsg("case conversion failed: %s", u_errorName(status)))); - return len_dest; -} - -static int32_t -u_strToTitle_default_BI(UChar *dest, int32_t destCapacity, - const UChar *src, int32_t srcLength, - const char *locale, - UErrorCode *pErrorCode) -{ - return u_strToTitle(dest, destCapacity, src, srcLength, - NULL, locale, pErrorCode); -} - -#endif /* USE_ICU */ - /* * If the system provides the needed functions for wide-character manipulation * (which are all standardized by C99), then we implement upper/lower/initcap @@ -1664,106 +1618,28 @@ str_tolower(const char *buff, size_t nbytes, Oid collid) } else { -#ifdef USE_ICU - if (mylocale->provider == 
COLLPROVIDER_ICU) - { - int32_t len_uchar; - int32_t len_conv; - UChar *buff_uchar; - UChar *buff_conv; - - len_uchar = icu_to_uchar(&buff_uchar, buff, nbytes); - len_conv = icu_convert_case(u_strToLower, mylocale, - &buff_conv, buff_uchar, len_uchar); - icu_from_uchar(&result, buff_conv, len_conv); - pfree(buff_uchar); - pfree(buff_conv); - } - else -#endif - if (mylocale->provider == COLLPROVIDER_BUILTIN) + const char *src = buff; + size_t srclen = nbytes; + size_t dstsize; + char *dst; + size_t needed; + + /* first try buffer of equal size plus terminating NUL */ + dstsize = srclen + 1; + dst = palloc(dstsize); + + needed = pg_strlower(dst, dstsize, src, srclen, mylocale); + if (needed + 1 > dstsize) { - const char *src = buff; - size_t srclen = nbytes; - size_t dstsize; - char *dst; - size_t needed; - - Assert(GetDatabaseEncoding() == PG_UTF8); - - /* first try buffer of equal size plus terminating NUL */ - dstsize = srclen + 1; - dst = palloc(dstsize); - - needed = unicode_strlower(dst, dstsize, src, srclen); - if (needed + 1 > dstsize) - { - /* grow buffer if needed and retry */ - dstsize = needed + 1; - dst = repalloc(dst, dstsize); - needed = unicode_strlower(dst, dstsize, src, srclen); - Assert(needed + 1 == dstsize); - } - - Assert(dst[needed] == '\0'); - result = dst; + /* grow buffer if needed and retry */ + dstsize = needed + 1; + dst = repalloc(dst, dstsize); + needed = pg_strlower(dst, dstsize, src, srclen, mylocale); + Assert(needed + 1 <= dstsize); } - else - { - Assert(mylocale->provider == COLLPROVIDER_LIBC); - - if (pg_database_encoding_max_length() > 1) - { - wchar_t *workspace; - size_t curr_char; - size_t result_size; - - /* Overflow paranoia */ - if ((nbytes + 1) > (INT_MAX / sizeof(wchar_t))) - ereport(ERROR, - (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("out of memory"))); - - /* Output workspace cannot have more codes than input bytes */ - workspace = (wchar_t *) palloc((nbytes + 1) * sizeof(wchar_t)); - - char2wchar(workspace, nbytes + 1, 
buff, nbytes, mylocale); - - for (curr_char = 0; workspace[curr_char] != 0; curr_char++) - workspace[curr_char] = towlower_l(workspace[curr_char], mylocale->info.lt); - - /* - * Make result large enough; case change might change number - * of bytes - */ - result_size = curr_char * pg_database_encoding_max_length() + 1; - result = palloc(result_size); - wchar2char(result, workspace, result_size, mylocale); - pfree(workspace); - } - else - { - char *p; - - result = pnstrdup(buff, nbytes); - - /* - * Note: we assume that tolower_l() will not be so broken as - * to need an isupper_l() guard test. When using the default - * collation, we apply the traditional Postgres behavior that - * forces ASCII-style treatment of I/i, but in non-default - * collations you get exactly what the collation says. - */ - for (p = result; *p; p++) - { - if (mylocale->is_default) - *p = pg_tolower((unsigned char) *p); - else - *p = tolower_l((unsigned char) *p, mylocale->info.lt); - } - } - } + Assert(dst[needed] == '\0'); + result = dst; } return result; @@ -1806,152 +1682,33 @@ str_toupper(const char *buff, size_t nbytes, Oid collid) } else { -#ifdef USE_ICU - if (mylocale->provider == COLLPROVIDER_ICU) + const char *src = buff; + size_t srclen = nbytes; + size_t dstsize; + char *dst; + size_t needed; + + /* first try buffer of equal size plus terminating NUL */ + dstsize = srclen + 1; + dst = palloc(dstsize); + + needed = pg_strupper(dst, dstsize, src, srclen, mylocale); + if (needed + 1 > dstsize) { - int32_t len_uchar, - len_conv; - UChar *buff_uchar; - UChar *buff_conv; - - len_uchar = icu_to_uchar(&buff_uchar, buff, nbytes); - len_conv = icu_convert_case(u_strToUpper, mylocale, - &buff_conv, buff_uchar, len_uchar); - icu_from_uchar(&result, buff_conv, len_conv); - pfree(buff_uchar); - pfree(buff_conv); + /* grow buffer if needed and retry */ + dstsize = needed + 1; + dst = repalloc(dst, dstsize); + needed = pg_strupper(dst, dstsize, src, srclen, mylocale); + Assert(needed + 1 <= 
dstsize); } - else -#endif - if (mylocale->provider == COLLPROVIDER_BUILTIN) - { - const char *src = buff; - size_t srclen = nbytes; - size_t dstsize; - char *dst; - size_t needed; - - Assert(GetDatabaseEncoding() == PG_UTF8); - - /* first try buffer of equal size plus terminating NUL */ - dstsize = srclen + 1; - dst = palloc(dstsize); - - needed = unicode_strupper(dst, dstsize, src, srclen); - if (needed + 1 > dstsize) - { - /* grow buffer if needed and retry */ - dstsize = needed + 1; - dst = repalloc(dst, dstsize); - needed = unicode_strupper(dst, dstsize, src, srclen); - Assert(needed + 1 == dstsize); - } - - Assert(dst[needed] == '\0'); - result = dst; - } - else - { - Assert(mylocale->provider == COLLPROVIDER_LIBC); - - if (pg_database_encoding_max_length() > 1) - { - wchar_t *workspace; - size_t curr_char; - size_t result_size; - - /* Overflow paranoia */ - if ((nbytes + 1) > (INT_MAX / sizeof(wchar_t))) - ereport(ERROR, - (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("out of memory"))); - - /* Output workspace cannot have more codes than input bytes */ - workspace = (wchar_t *) palloc((nbytes + 1) * sizeof(wchar_t)); - - char2wchar(workspace, nbytes + 1, buff, nbytes, mylocale); - - for (curr_char = 0; workspace[curr_char] != 0; curr_char++) - workspace[curr_char] = towupper_l(workspace[curr_char], mylocale->info.lt); - - /* - * Make result large enough; case change might change number - * of bytes - */ - result_size = curr_char * pg_database_encoding_max_length() + 1; - result = palloc(result_size); - wchar2char(result, workspace, result_size, mylocale); - pfree(workspace); - } - else - { - char *p; - - result = pnstrdup(buff, nbytes); - - /* - * Note: we assume that toupper_l() will not be so broken as - * to need an islower_l() guard test. When using the default - * collation, we apply the traditional Postgres behavior that - * forces ASCII-style treatment of I/i, but in non-default - * collations you get exactly what the collation says. 
- */ - for (p = result; *p; p++) - { - if (mylocale->is_default) - *p = pg_toupper((unsigned char) *p); - else - *p = toupper_l((unsigned char) *p, mylocale->info.lt); - } - } - } + Assert(dst[needed] == '\0'); + result = dst; } return result; } -struct WordBoundaryState -{ - const char *str; - size_t len; - size_t offset; - bool init; - bool prev_alnum; -}; - -/* - * Simple word boundary iterator that draws boundaries each time the result of - * pg_u_isalnum() changes. - */ -static size_t -initcap_wbnext(void *state) -{ - struct WordBoundaryState *wbstate = (struct WordBoundaryState *) state; - - while (wbstate->offset < wbstate->len && - wbstate->str[wbstate->offset] != '\0') - { - pg_wchar u = utf8_to_unicode((unsigned char *) wbstate->str + - wbstate->offset); - bool curr_alnum = pg_u_isalnum(u, true); - - if (!wbstate->init || curr_alnum != wbstate->prev_alnum) - { - size_t prev_offset = wbstate->offset; - - wbstate->init = true; - wbstate->offset += unicode_utf8len(u); - wbstate->prev_alnum = curr_alnum; - return prev_offset; - } - - wbstate->offset += unicode_utf8len(u); - } - - return wbstate->len; -} - /* * collation-aware, wide-character-aware initcap function * @@ -1962,7 +1719,6 @@ char * str_initcap(const char *buff, size_t nbytes, Oid collid) { char *result; - int wasalnum = false; pg_locale_t mylocale; if (!buff) @@ -1990,135 +1746,28 @@ str_initcap(const char *buff, size_t nbytes, Oid collid) } else { -#ifdef USE_ICU - if (mylocale->provider == COLLPROVIDER_ICU) + const char *src = buff; + size_t srclen = nbytes; + size_t dstsize; + char *dst; + size_t needed; + + /* first try buffer of equal size plus terminating NUL */ + dstsize = srclen + 1; + dst = palloc(dstsize); + + needed = pg_strtitle(dst, dstsize, src, srclen, mylocale); + if (needed + 1 > dstsize) { - int32_t len_uchar, - len_conv; - UChar *buff_uchar; - UChar *buff_conv; - - len_uchar = icu_to_uchar(&buff_uchar, buff, nbytes); - len_conv = icu_convert_case(u_strToTitle_default_BI, 
mylocale, - &buff_conv, buff_uchar, len_uchar); - icu_from_uchar(&result, buff_conv, len_conv); - pfree(buff_uchar); - pfree(buff_conv); + /* grow buffer if needed and retry */ + dstsize = needed + 1; + dst = repalloc(dst, dstsize); + needed = pg_strtitle(dst, dstsize, src, srclen, mylocale); + Assert(needed + 1 <= dstsize); } - else -#endif - if (mylocale->provider == COLLPROVIDER_BUILTIN) - { - const char *src = buff; - size_t srclen = nbytes; - size_t dstsize; - char *dst; - size_t needed; - struct WordBoundaryState wbstate = { - .str = src, - .len = srclen, - .offset = 0, - .init = false, - .prev_alnum = false, - }; - - Assert(GetDatabaseEncoding() == PG_UTF8); - - /* first try buffer of equal size plus terminating NUL */ - dstsize = srclen + 1; - dst = palloc(dstsize); - - needed = unicode_strtitle(dst, dstsize, src, srclen, - initcap_wbnext, &wbstate); - if (needed + 1 > dstsize) - { - /* reset iterator */ - wbstate.offset = 0; - wbstate.init = false; - - /* grow buffer if needed and retry */ - dstsize = needed + 1; - dst = repalloc(dst, dstsize); - needed = unicode_strtitle(dst, dstsize, src, srclen, - initcap_wbnext, &wbstate); - Assert(needed + 1 == dstsize); - } - result = dst; - } - else - { - Assert(mylocale->provider == COLLPROVIDER_LIBC); - - if (pg_database_encoding_max_length() > 1) - { - wchar_t *workspace; - size_t curr_char; - size_t result_size; - - /* Overflow paranoia */ - if ((nbytes + 1) > (INT_MAX / sizeof(wchar_t))) - ereport(ERROR, - (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("out of memory"))); - - /* Output workspace cannot have more codes than input bytes */ - workspace = (wchar_t *) palloc((nbytes + 1) * sizeof(wchar_t)); - - char2wchar(workspace, nbytes + 1, buff, nbytes, mylocale); - - for (curr_char = 0; workspace[curr_char] != 0; curr_char++) - { - if (wasalnum) - workspace[curr_char] = towlower_l(workspace[curr_char], mylocale->info.lt); - else - workspace[curr_char] = towupper_l(workspace[curr_char], mylocale->info.lt); - 
wasalnum = iswalnum_l(workspace[curr_char], mylocale->info.lt); - } - - /* - * Make result large enough; case change might change number - * of bytes - */ - result_size = curr_char * pg_database_encoding_max_length() + 1; - result = palloc(result_size); - - wchar2char(result, workspace, result_size, mylocale); - pfree(workspace); - } - else - { - char *p; - - result = pnstrdup(buff, nbytes); - - /* - * Note: we assume that toupper_l()/tolower_l() will not be so - * broken as to need guard tests. When using the default - * collation, we apply the traditional Postgres behavior that - * forces ASCII-style treatment of I/i, but in non-default - * collations you get exactly what the collation says. - */ - for (p = result; *p; p++) - { - if (mylocale->is_default) - { - if (wasalnum) - *p = pg_tolower((unsigned char) *p); - else - *p = pg_toupper((unsigned char) *p); - } - else - { - if (wasalnum) - *p = tolower_l((unsigned char) *p, mylocale->info.lt); - else - *p = toupper_l((unsigned char) *p, mylocale->info.lt); - } - wasalnum = isalnum_l((unsigned char) *p, mylocale->info.lt); - } - } - } + Assert(dst[needed] == '\0'); + result = dst; } return result; |