about | summary | refs | log | tree | commit | diff
path: root/src/backend/utils/adt/formatting.c
diff options
context:
space:
mode:
author: Jeff Davis <jdavis@postgresql.org> 2024-12-16 09:35:18 -0800
committer: Jeff Davis <jdavis@postgresql.org> 2024-12-16 09:35:18 -0800
commit: 86a5d6006aff956a5e00982b7628177fa7dc5027 (patch)
tree: 90edf4aeac487e9dba957a5fbb75f3d6ae9e58f6 /src/backend/utils/adt/formatting.c
parent: de1e29885730851787b467449f525ff6fc7d69fa (diff)
download: postgresql-86a5d6006aff956a5e00982b7628177fa7dc5027.tar.gz
download: postgresql-86a5d6006aff956a5e00982b7628177fa7dc5027.zip
Refactor string case conversion into provider-specific files.
Create API entry points pg_strlower(), etc., that work with any provider and give the caller control over the destination buffer. Then, move provider-specific logic into pg_locale_builtin.c, pg_locale_icu.c, and pg_locale_libc.c as appropriate.

Discussion: https://postgr.es/m/7aa46d77b377428058403723440862d12a8a129a.camel@j-davis.com
Diffstat (limited to 'src/backend/utils/adt/formatting.c')
-rw-r--r-- src/backend/utils/adt/formatting.c 465
1 files changed, 57 insertions, 408 deletions
diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c
index 0dcb5515119..30c06c8d099 100644
--- a/src/backend/utils/adt/formatting.c
+++ b/src/backend/utils/adt/formatting.c
@@ -1571,52 +1571,6 @@ str_numth(char *dest, char *num, int type)
* upper/lower/initcap functions
*****************************************************************************/
-#ifdef USE_ICU
-
-typedef int32_t (*ICU_Convert_Func) (UChar *dest, int32_t destCapacity,
- const UChar *src, int32_t srcLength,
- const char *locale,
- UErrorCode *pErrorCode);
-
-static int32_t
-icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale,
- UChar **buff_dest, UChar *buff_source, int32_t len_source)
-{
- UErrorCode status;
- int32_t len_dest;
-
- len_dest = len_source; /* try first with same length */
- *buff_dest = palloc(len_dest * sizeof(**buff_dest));
- status = U_ZERO_ERROR;
- len_dest = func(*buff_dest, len_dest, buff_source, len_source,
- mylocale->info.icu.locale, &status);
- if (status == U_BUFFER_OVERFLOW_ERROR)
- {
- /* try again with adjusted length */
- pfree(*buff_dest);
- *buff_dest = palloc(len_dest * sizeof(**buff_dest));
- status = U_ZERO_ERROR;
- len_dest = func(*buff_dest, len_dest, buff_source, len_source,
- mylocale->info.icu.locale, &status);
- }
- if (U_FAILURE(status))
- ereport(ERROR,
- (errmsg("case conversion failed: %s", u_errorName(status))));
- return len_dest;
-}
-
-static int32_t
-u_strToTitle_default_BI(UChar *dest, int32_t destCapacity,
- const UChar *src, int32_t srcLength,
- const char *locale,
- UErrorCode *pErrorCode)
-{
- return u_strToTitle(dest, destCapacity, src, srcLength,
- NULL, locale, pErrorCode);
-}
-
-#endif /* USE_ICU */
-
/*
* If the system provides the needed functions for wide-character manipulation
* (which are all standardized by C99), then we implement upper/lower/initcap
@@ -1664,106 +1618,28 @@ str_tolower(const char *buff, size_t nbytes, Oid collid)
}
else
{
-#ifdef USE_ICU
- if (mylocale->provider == COLLPROVIDER_ICU)
- {
- int32_t len_uchar;
- int32_t len_conv;
- UChar *buff_uchar;
- UChar *buff_conv;
-
- len_uchar = icu_to_uchar(&buff_uchar, buff, nbytes);
- len_conv = icu_convert_case(u_strToLower, mylocale,
- &buff_conv, buff_uchar, len_uchar);
- icu_from_uchar(&result, buff_conv, len_conv);
- pfree(buff_uchar);
- pfree(buff_conv);
- }
- else
-#endif
- if (mylocale->provider == COLLPROVIDER_BUILTIN)
+ const char *src = buff;
+ size_t srclen = nbytes;
+ size_t dstsize;
+ char *dst;
+ size_t needed;
+
+ /* first try buffer of equal size plus terminating NUL */
+ dstsize = srclen + 1;
+ dst = palloc(dstsize);
+
+ needed = pg_strlower(dst, dstsize, src, srclen, mylocale);
+ if (needed + 1 > dstsize)
{
- const char *src = buff;
- size_t srclen = nbytes;
- size_t dstsize;
- char *dst;
- size_t needed;
-
- Assert(GetDatabaseEncoding() == PG_UTF8);
-
- /* first try buffer of equal size plus terminating NUL */
- dstsize = srclen + 1;
- dst = palloc(dstsize);
-
- needed = unicode_strlower(dst, dstsize, src, srclen);
- if (needed + 1 > dstsize)
- {
- /* grow buffer if needed and retry */
- dstsize = needed + 1;
- dst = repalloc(dst, dstsize);
- needed = unicode_strlower(dst, dstsize, src, srclen);
- Assert(needed + 1 == dstsize);
- }
-
- Assert(dst[needed] == '\0');
- result = dst;
+ /* grow buffer if needed and retry */
+ dstsize = needed + 1;
+ dst = repalloc(dst, dstsize);
+ needed = pg_strlower(dst, dstsize, src, srclen, mylocale);
+ Assert(needed + 1 <= dstsize);
}
- else
- {
- Assert(mylocale->provider == COLLPROVIDER_LIBC);
-
- if (pg_database_encoding_max_length() > 1)
- {
- wchar_t *workspace;
- size_t curr_char;
- size_t result_size;
-
- /* Overflow paranoia */
- if ((nbytes + 1) > (INT_MAX / sizeof(wchar_t)))
- ereport(ERROR,
- (errcode(ERRCODE_OUT_OF_MEMORY),
- errmsg("out of memory")));
-
- /* Output workspace cannot have more codes than input bytes */
- workspace = (wchar_t *) palloc((nbytes + 1) * sizeof(wchar_t));
-
- char2wchar(workspace, nbytes + 1, buff, nbytes, mylocale);
-
- for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
- workspace[curr_char] = towlower_l(workspace[curr_char], mylocale->info.lt);
-
- /*
- * Make result large enough; case change might change number
- * of bytes
- */
- result_size = curr_char * pg_database_encoding_max_length() + 1;
- result = palloc(result_size);
- wchar2char(result, workspace, result_size, mylocale);
- pfree(workspace);
- }
- else
- {
- char *p;
-
- result = pnstrdup(buff, nbytes);
-
- /*
- * Note: we assume that tolower_l() will not be so broken as
- * to need an isupper_l() guard test. When using the default
- * collation, we apply the traditional Postgres behavior that
- * forces ASCII-style treatment of I/i, but in non-default
- * collations you get exactly what the collation says.
- */
- for (p = result; *p; p++)
- {
- if (mylocale->is_default)
- *p = pg_tolower((unsigned char) *p);
- else
- *p = tolower_l((unsigned char) *p, mylocale->info.lt);
- }
- }
- }
+ Assert(dst[needed] == '\0');
+ result = dst;
}
return result;
@@ -1806,152 +1682,33 @@ str_toupper(const char *buff, size_t nbytes, Oid collid)
}
else
{
-#ifdef USE_ICU
- if (mylocale->provider == COLLPROVIDER_ICU)
+ const char *src = buff;
+ size_t srclen = nbytes;
+ size_t dstsize;
+ char *dst;
+ size_t needed;
+
+ /* first try buffer of equal size plus terminating NUL */
+ dstsize = srclen + 1;
+ dst = palloc(dstsize);
+
+ needed = pg_strupper(dst, dstsize, src, srclen, mylocale);
+ if (needed + 1 > dstsize)
{
- int32_t len_uchar,
- len_conv;
- UChar *buff_uchar;
- UChar *buff_conv;
-
- len_uchar = icu_to_uchar(&buff_uchar, buff, nbytes);
- len_conv = icu_convert_case(u_strToUpper, mylocale,
- &buff_conv, buff_uchar, len_uchar);
- icu_from_uchar(&result, buff_conv, len_conv);
- pfree(buff_uchar);
- pfree(buff_conv);
+ /* grow buffer if needed and retry */
+ dstsize = needed + 1;
+ dst = repalloc(dst, dstsize);
+ needed = pg_strupper(dst, dstsize, src, srclen, mylocale);
+ Assert(needed + 1 <= dstsize);
}
- else
-#endif
- if (mylocale->provider == COLLPROVIDER_BUILTIN)
- {
- const char *src = buff;
- size_t srclen = nbytes;
- size_t dstsize;
- char *dst;
- size_t needed;
-
- Assert(GetDatabaseEncoding() == PG_UTF8);
-
- /* first try buffer of equal size plus terminating NUL */
- dstsize = srclen + 1;
- dst = palloc(dstsize);
-
- needed = unicode_strupper(dst, dstsize, src, srclen);
- if (needed + 1 > dstsize)
- {
- /* grow buffer if needed and retry */
- dstsize = needed + 1;
- dst = repalloc(dst, dstsize);
- needed = unicode_strupper(dst, dstsize, src, srclen);
- Assert(needed + 1 == dstsize);
- }
-
- Assert(dst[needed] == '\0');
- result = dst;
- }
- else
- {
- Assert(mylocale->provider == COLLPROVIDER_LIBC);
-
- if (pg_database_encoding_max_length() > 1)
- {
- wchar_t *workspace;
- size_t curr_char;
- size_t result_size;
-
- /* Overflow paranoia */
- if ((nbytes + 1) > (INT_MAX / sizeof(wchar_t)))
- ereport(ERROR,
- (errcode(ERRCODE_OUT_OF_MEMORY),
- errmsg("out of memory")));
-
- /* Output workspace cannot have more codes than input bytes */
- workspace = (wchar_t *) palloc((nbytes + 1) * sizeof(wchar_t));
-
- char2wchar(workspace, nbytes + 1, buff, nbytes, mylocale);
-
- for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
- workspace[curr_char] = towupper_l(workspace[curr_char], mylocale->info.lt);
-
- /*
- * Make result large enough; case change might change number
- * of bytes
- */
- result_size = curr_char * pg_database_encoding_max_length() + 1;
- result = palloc(result_size);
- wchar2char(result, workspace, result_size, mylocale);
- pfree(workspace);
- }
- else
- {
- char *p;
-
- result = pnstrdup(buff, nbytes);
-
- /*
- * Note: we assume that toupper_l() will not be so broken as
- * to need an islower_l() guard test. When using the default
- * collation, we apply the traditional Postgres behavior that
- * forces ASCII-style treatment of I/i, but in non-default
- * collations you get exactly what the collation says.
- */
- for (p = result; *p; p++)
- {
- if (mylocale->is_default)
- *p = pg_toupper((unsigned char) *p);
- else
- *p = toupper_l((unsigned char) *p, mylocale->info.lt);
- }
- }
- }
+ Assert(dst[needed] == '\0');
+ result = dst;
}
return result;
}
-struct WordBoundaryState
-{
- const char *str;
- size_t len;
- size_t offset;
- bool init;
- bool prev_alnum;
-};
-
-/*
- * Simple word boundary iterator that draws boundaries each time the result of
- * pg_u_isalnum() changes.
- */
-static size_t
-initcap_wbnext(void *state)
-{
- struct WordBoundaryState *wbstate = (struct WordBoundaryState *) state;
-
- while (wbstate->offset < wbstate->len &&
- wbstate->str[wbstate->offset] != '\0')
- {
- pg_wchar u = utf8_to_unicode((unsigned char *) wbstate->str +
- wbstate->offset);
- bool curr_alnum = pg_u_isalnum(u, true);
-
- if (!wbstate->init || curr_alnum != wbstate->prev_alnum)
- {
- size_t prev_offset = wbstate->offset;
-
- wbstate->init = true;
- wbstate->offset += unicode_utf8len(u);
- wbstate->prev_alnum = curr_alnum;
- return prev_offset;
- }
-
- wbstate->offset += unicode_utf8len(u);
- }
-
- return wbstate->len;
-}
-
/*
* collation-aware, wide-character-aware initcap function
*
@@ -1962,7 +1719,6 @@ char *
str_initcap(const char *buff, size_t nbytes, Oid collid)
{
char *result;
- int wasalnum = false;
pg_locale_t mylocale;
if (!buff)
@@ -1990,135 +1746,28 @@ str_initcap(const char *buff, size_t nbytes, Oid collid)
}
else
{
-#ifdef USE_ICU
- if (mylocale->provider == COLLPROVIDER_ICU)
+ const char *src = buff;
+ size_t srclen = nbytes;
+ size_t dstsize;
+ char *dst;
+ size_t needed;
+
+ /* first try buffer of equal size plus terminating NUL */
+ dstsize = srclen + 1;
+ dst = palloc(dstsize);
+
+ needed = pg_strtitle(dst, dstsize, src, srclen, mylocale);
+ if (needed + 1 > dstsize)
{
- int32_t len_uchar,
- len_conv;
- UChar *buff_uchar;
- UChar *buff_conv;
-
- len_uchar = icu_to_uchar(&buff_uchar, buff, nbytes);
- len_conv = icu_convert_case(u_strToTitle_default_BI, mylocale,
- &buff_conv, buff_uchar, len_uchar);
- icu_from_uchar(&result, buff_conv, len_conv);
- pfree(buff_uchar);
- pfree(buff_conv);
+ /* grow buffer if needed and retry */
+ dstsize = needed + 1;
+ dst = repalloc(dst, dstsize);
+ needed = pg_strtitle(dst, dstsize, src, srclen, mylocale);
+ Assert(needed + 1 <= dstsize);
}
- else
-#endif
- if (mylocale->provider == COLLPROVIDER_BUILTIN)
- {
- const char *src = buff;
- size_t srclen = nbytes;
- size_t dstsize;
- char *dst;
- size_t needed;
- struct WordBoundaryState wbstate = {
- .str = src,
- .len = srclen,
- .offset = 0,
- .init = false,
- .prev_alnum = false,
- };
-
- Assert(GetDatabaseEncoding() == PG_UTF8);
-
- /* first try buffer of equal size plus terminating NUL */
- dstsize = srclen + 1;
- dst = palloc(dstsize);
-
- needed = unicode_strtitle(dst, dstsize, src, srclen,
- initcap_wbnext, &wbstate);
- if (needed + 1 > dstsize)
- {
- /* reset iterator */
- wbstate.offset = 0;
- wbstate.init = false;
-
- /* grow buffer if needed and retry */
- dstsize = needed + 1;
- dst = repalloc(dst, dstsize);
- needed = unicode_strtitle(dst, dstsize, src, srclen,
- initcap_wbnext, &wbstate);
- Assert(needed + 1 == dstsize);
- }
- result = dst;
- }
- else
- {
- Assert(mylocale->provider == COLLPROVIDER_LIBC);
-
- if (pg_database_encoding_max_length() > 1)
- {
- wchar_t *workspace;
- size_t curr_char;
- size_t result_size;
-
- /* Overflow paranoia */
- if ((nbytes + 1) > (INT_MAX / sizeof(wchar_t)))
- ereport(ERROR,
- (errcode(ERRCODE_OUT_OF_MEMORY),
- errmsg("out of memory")));
-
- /* Output workspace cannot have more codes than input bytes */
- workspace = (wchar_t *) palloc((nbytes + 1) * sizeof(wchar_t));
-
- char2wchar(workspace, nbytes + 1, buff, nbytes, mylocale);
-
- for (curr_char = 0; workspace[curr_char] != 0; curr_char++)
- {
- if (wasalnum)
- workspace[curr_char] = towlower_l(workspace[curr_char], mylocale->info.lt);
- else
- workspace[curr_char] = towupper_l(workspace[curr_char], mylocale->info.lt);
- wasalnum = iswalnum_l(workspace[curr_char], mylocale->info.lt);
- }
-
- /*
- * Make result large enough; case change might change number
- * of bytes
- */
- result_size = curr_char * pg_database_encoding_max_length() + 1;
- result = palloc(result_size);
-
- wchar2char(result, workspace, result_size, mylocale);
- pfree(workspace);
- }
- else
- {
- char *p;
-
- result = pnstrdup(buff, nbytes);
-
- /*
- * Note: we assume that toupper_l()/tolower_l() will not be so
- * broken as to need guard tests. When using the default
- * collation, we apply the traditional Postgres behavior that
- * forces ASCII-style treatment of I/i, but in non-default
- * collations you get exactly what the collation says.
- */
- for (p = result; *p; p++)
- {
- if (mylocale->is_default)
- {
- if (wasalnum)
- *p = pg_tolower((unsigned char) *p);
- else
- *p = pg_toupper((unsigned char) *p);
- }
- else
- {
- if (wasalnum)
- *p = tolower_l((unsigned char) *p, mylocale->info.lt);
- else
- *p = toupper_l((unsigned char) *p, mylocale->info.lt);
- }
- wasalnum = isalnum_l((unsigned char) *p, mylocale->info.lt);
- }
- }
- }
+ Assert(dst[needed] == '\0');
+ result = dst;
}
return result;