diff options
-rw-r--r-- | doc/src/sgml/charset.sgml | 27 | ||||
-rw-r--r-- | doc/src/sgml/ref/create_collation.sgml | 2 | ||||
-rw-r--r-- | doc/src/sgml/ref/create_database.sgml | 13 | ||||
-rw-r--r-- | doc/src/sgml/ref/initdb.sgml | 5 | ||||
-rw-r--r-- | src/backend/regex/regc_pg_locale.c | 36 | ||||
-rw-r--r-- | src/backend/utils/adt/formatting.c | 112 | ||||
-rw-r--r-- | src/backend/utils/adt/pg_locale.c | 52 | ||||
-rw-r--r-- | src/bin/initdb/initdb.c | 16 | ||||
-rw-r--r-- | src/bin/initdb/t/001_initdb.pl | 17 | ||||
-rw-r--r-- | src/bin/pg_upgrade/t/002_pg_upgrade.pl | 2 | ||||
-rw-r--r-- | src/bin/scripts/t/020_createdb.pl | 18 | ||||
-rw-r--r-- | src/include/catalog/catversion.h | 2 | ||||
-rw-r--r-- | src/include/catalog/pg_collation.dat | 3 | ||||
-rw-r--r-- | src/test/regress/expected/collate.utf8.out | 136 | ||||
-rw-r--r-- | src/test/regress/expected/collate.utf8_1.out | 8 | ||||
-rw-r--r-- | src/test/regress/parallel_schedule | 4 | ||||
-rw-r--r-- | src/test/regress/sql/collate.utf8.sql | 67 |
17 files changed, 494 insertions, 26 deletions
diff --git a/doc/src/sgml/charset.sgml b/doc/src/sgml/charset.sgml index 7114eb7b522..55bbb20dacc 100644 --- a/doc/src/sgml/charset.sgml +++ b/doc/src/sgml/charset.sgml @@ -377,13 +377,21 @@ initdb --locale-provider=icu --icu-locale=en <listitem> <para> The <literal>builtin</literal> provider uses built-in operations. Only - the <literal>C</literal> locale is supported for this provider. + the <literal>C</literal> and <literal>C.UTF-8</literal> locales are + supported for this provider. </para> <para> The <literal>C</literal> locale behavior is identical to the <literal>C</literal> locale in the libc provider. When using this locale, the behavior may depend on the database encoding. </para> + <para> + The <literal>C.UTF-8</literal> locale is available only when the + database encoding is <literal>UTF-8</literal>, and the behavior is + based on Unicode. The collation uses the code point values only. The + regular expression character classes are based on the "POSIX + Compatible" semantics, and the case mapping is the "simple" variant. + </para> </listitem> </varlistentry> @@ -879,6 +887,23 @@ SELECT * FROM test1 ORDER BY a || b COLLATE "fr_FR"; </varlistentry> <varlistentry> + <term><literal>pg_c_utf8</literal></term> + <listitem> + <para> + This collation sorts by Unicode code point values rather than natural + language order. For the functions <function>lower</function>, + <function>initcap</function>, and <function>upper</function>, it uses + Unicode simple case mapping. For pattern matching (including regular + expressions), it uses the POSIX Compatible variant of Unicode <ulink + url="https://www.unicode.org/reports/tr18/#Compatibility_Properties">Compatibility + Properties</ulink>. Behavior is efficient and stable within a + <productname>Postgres</productname> major version. This collation is + only available for encoding <literal>UTF8</literal>. 
+ </para> + </listitem> + </varlistentry> + + <varlistentry> <term><literal>C</literal> (equivalent to <literal>POSIX</literal>)</term> <listitem> <para> diff --git a/doc/src/sgml/ref/create_collation.sgml b/doc/src/sgml/ref/create_collation.sgml index 98cd7d56be9..85f18cbbe5d 100644 --- a/doc/src/sgml/ref/create_collation.sgml +++ b/doc/src/sgml/ref/create_collation.sgml @@ -99,7 +99,7 @@ CREATE COLLATION [ IF NOT EXISTS ] <replaceable>name</replaceable> FROM <replace <para> If <replaceable>provider</replaceable> is <literal>builtin</literal>, then <replaceable>locale</replaceable> must be specified and set to - <literal>C</literal>. + either <literal>C</literal> or <literal>C.UTF-8</literal>. </para> </listitem> </varlistentry> diff --git a/doc/src/sgml/ref/create_database.sgml b/doc/src/sgml/ref/create_database.sgml index 233ff1755dd..7653cb902ee 100644 --- a/doc/src/sgml/ref/create_database.sgml +++ b/doc/src/sgml/ref/create_database.sgml @@ -166,8 +166,9 @@ CREATE DATABASE <replaceable class="parameter">name</replaceable> </para> <para> If <xref linkend="create-database-locale-provider"/> is - <literal>builtin</literal>, then <replaceable>locale</replaceable> - must be specified and set to <literal>C</literal>. + <literal>builtin</literal>, then <replaceable>locale</replaceable> or + <replaceable>builtin_locale</replaceable> must be specified and set to + either <literal>C</literal> or <literal>C.UTF-8</literal>. </para> <tip> <para> @@ -228,9 +229,11 @@ CREATE DATABASE <replaceable class="parameter">name</replaceable> linkend="create-database-locale-provider">locale provider</link> must be <literal>builtin</literal>. The default is the setting of <xref linkend="create-database-locale"/> if specified; otherwise the same - setting as the template database. Currently, the only available - locale for the <literal>builtin</literal> provider is - <literal>C</literal>. + setting as the template database. 
+ </para> + <para> + The locales available for the <literal>builtin</literal> provider are + <literal>C</literal> and <literal>C.UTF-8</literal>. </para> </listitem> </varlistentry> diff --git a/doc/src/sgml/ref/initdb.sgml b/doc/src/sgml/ref/initdb.sgml index 4760570f6ab..377c3cb20aa 100644 --- a/doc/src/sgml/ref/initdb.sgml +++ b/doc/src/sgml/ref/initdb.sgml @@ -288,8 +288,9 @@ PostgreSQL documentation </para> <para> If <option>--locale-provider</option> is <literal>builtin</literal>, - <option>--locale</option> must be specified and set to - <literal>C</literal>. + <option>--locale</option> or <option>--builtin-locale</option> must be + specified and set to <literal>C</literal> or + <literal>C.UTF-8</literal>. </para> </listitem> </varlistentry> diff --git a/src/backend/regex/regc_pg_locale.c b/src/backend/regex/regc_pg_locale.c index 6a26388bfa3..85f3238eb07 100644 --- a/src/backend/regex/regc_pg_locale.c +++ b/src/backend/regex/regc_pg_locale.c @@ -16,6 +16,8 @@ */ #include "catalog/pg_collation.h" +#include "common/unicode_case.h" +#include "common/unicode_category.h" #include "utils/pg_locale.h" /* @@ -64,6 +66,7 @@ typedef enum { PG_REGEX_LOCALE_C, /* C locale (encoding independent) */ + PG_REGEX_BUILTIN, /* built-in Unicode semantics */ PG_REGEX_LOCALE_WIDE, /* Use <wctype.h> functions */ PG_REGEX_LOCALE_1BYTE, /* Use <ctype.h> functions */ PG_REGEX_LOCALE_WIDE_L, /* Use locale_t <wctype.h> functions */ @@ -266,7 +269,12 @@ pg_set_regex_collation(Oid collation) if (GetDatabaseEncoding() == PG_UTF8) { if (pg_regex_locale) - pg_regex_strategy = PG_REGEX_LOCALE_WIDE_L; + { + if (pg_regex_locale->provider == COLLPROVIDER_BUILTIN) + pg_regex_strategy = PG_REGEX_BUILTIN; + else + pg_regex_strategy = PG_REGEX_LOCALE_WIDE_L; + } else pg_regex_strategy = PG_REGEX_LOCALE_WIDE; } @@ -290,6 +298,8 @@ pg_wc_isdigit(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISDIGIT)); + case PG_REGEX_BUILTIN: + return pg_u_isdigit(c, 
true); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswdigit((wint_t) c); @@ -322,6 +332,8 @@ pg_wc_isalpha(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISALPHA)); + case PG_REGEX_BUILTIN: + return pg_u_isalpha(c); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswalpha((wint_t) c); @@ -354,6 +366,8 @@ pg_wc_isalnum(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISALNUM)); + case PG_REGEX_BUILTIN: + return pg_u_isalnum(c, true); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswalnum((wint_t) c); @@ -395,6 +409,8 @@ pg_wc_isupper(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISUPPER)); + case PG_REGEX_BUILTIN: + return pg_u_isupper(c); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswupper((wint_t) c); @@ -427,6 +443,8 @@ pg_wc_islower(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISLOWER)); + case PG_REGEX_BUILTIN: + return pg_u_islower(c); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswlower((wint_t) c); @@ -459,6 +477,8 @@ pg_wc_isgraph(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISGRAPH)); + case PG_REGEX_BUILTIN: + return pg_u_isgraph(c); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswgraph((wint_t) c); @@ -491,6 +511,8 @@ pg_wc_isprint(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISPRINT)); + case PG_REGEX_BUILTIN: + return pg_u_isprint(c); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswprint((wint_t) c); @@ -523,6 +545,8 @@ pg_wc_ispunct(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= 
(pg_wchar) 127 && (pg_char_properties[c] & PG_ISPUNCT)); + case PG_REGEX_BUILTIN: + return pg_u_ispunct(c, true); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswpunct((wint_t) c); @@ -555,6 +579,8 @@ pg_wc_isspace(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISSPACE)); + case PG_REGEX_BUILTIN: + return pg_u_isspace(c); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswspace((wint_t) c); @@ -588,6 +614,8 @@ pg_wc_toupper(pg_wchar c) if (c <= (pg_wchar) 127) return pg_ascii_toupper((unsigned char) c); return c; + case PG_REGEX_BUILTIN: + return unicode_uppercase_simple(c); case PG_REGEX_LOCALE_WIDE: /* force C behavior for ASCII characters, per comments above */ if (c <= (pg_wchar) 127) @@ -628,6 +656,8 @@ pg_wc_tolower(pg_wchar c) if (c <= (pg_wchar) 127) return pg_ascii_tolower((unsigned char) c); return c; + case PG_REGEX_BUILTIN: + return unicode_lowercase_simple(c); case PG_REGEX_LOCALE_WIDE: /* force C behavior for ASCII characters, per comments above */ if (c <= (pg_wchar) 127) @@ -792,6 +822,9 @@ pg_ctype_get_cache(pg_wc_probefunc probefunc, int cclasscode) max_chr = (pg_wchar) MAX_SIMPLE_CHR; #endif break; + case PG_REGEX_BUILTIN: + max_chr = (pg_wchar) MAX_SIMPLE_CHR; + break; case PG_REGEX_LOCALE_WIDE: case PG_REGEX_LOCALE_WIDE_L: max_chr = (pg_wchar) MAX_SIMPLE_CHR; @@ -809,6 +842,7 @@ pg_ctype_get_cache(pg_wc_probefunc probefunc, int cclasscode) max_chr = (pg_wchar) MAX_SIMPLE_CHR; break; default: + Assert(false); max_chr = 0; /* can't get here, but keep compiler quiet */ break; } diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c index 5f483b8dbc2..8160d78ec6d 100644 --- a/src/backend/utils/adt/formatting.c +++ b/src/backend/utils/adt/formatting.c @@ -77,6 +77,8 @@ #include "catalog/pg_collation.h" #include "catalog/pg_type.h" +#include "common/unicode_case.h" +#include 
"common/unicode_category.h" #include "mb/pg_wchar.h" #include "nodes/miscnodes.h" #include "parser/scansup.h" @@ -1679,6 +1681,34 @@ str_tolower(const char *buff, size_t nbytes, Oid collid) } else #endif + if (mylocale && mylocale->provider == COLLPROVIDER_BUILTIN) + { + const char *src = buff; + size_t srclen = nbytes; + size_t dstsize; + char *dst; + size_t needed; + + Assert(GetDatabaseEncoding() == PG_UTF8); + + /* first try buffer of equal size plus terminating NUL */ + dstsize = srclen + 1; + dst = palloc(dstsize); + + needed = unicode_strlower(dst, dstsize, src, srclen); + if (needed + 1 > dstsize) + { + /* grow buffer if needed and retry */ + dstsize = needed + 1; + dst = repalloc(dst, dstsize); + needed = unicode_strlower(dst, dstsize, src, srclen); + Assert(needed + 1 == dstsize); + } + + Assert(dst[needed] == '\0'); + result = dst; + } + else { Assert(!mylocale || mylocale->provider == COLLPROVIDER_LIBC); @@ -1799,6 +1829,34 @@ str_toupper(const char *buff, size_t nbytes, Oid collid) } else #endif + if (mylocale && mylocale->provider == COLLPROVIDER_BUILTIN) + { + const char *src = buff; + size_t srclen = nbytes; + size_t dstsize; + char *dst; + size_t needed; + + Assert(GetDatabaseEncoding() == PG_UTF8); + + /* first try buffer of equal size plus terminating NUL */ + dstsize = srclen + 1; + dst = palloc(dstsize); + + needed = unicode_strupper(dst, dstsize, src, srclen); + if (needed + 1 > dstsize) + { + /* grow buffer if needed and retry */ + dstsize = needed + 1; + dst = repalloc(dst, dstsize); + needed = unicode_strupper(dst, dstsize, src, srclen); + Assert(needed + 1 == dstsize); + } + + Assert(dst[needed] == '\0'); + result = dst; + } + else { Assert(!mylocale || mylocale->provider == COLLPROVIDER_LIBC); @@ -1920,6 +1978,60 @@ str_initcap(const char *buff, size_t nbytes, Oid collid) } else #endif + if (mylocale && mylocale->provider == COLLPROVIDER_BUILTIN) + { + const unsigned char *src = (unsigned char *) buff; + size_t srclen = nbytes; + unsigned 
char *dst; + size_t dstsize; + int srcoff = 0; + int dstoff = 0; + + Assert(GetDatabaseEncoding() == PG_UTF8); + + /* overflow paranoia */ + if ((srclen + 1) > (INT_MAX / MAX_MULTIBYTE_CHAR_LEN)) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + + /* result is at most srclen codepoints plus terminating NUL */ + dstsize = srclen * MAX_MULTIBYTE_CHAR_LEN + 1; + dst = (unsigned char *) palloc(dstsize); + + while (srcoff < nbytes) + { + pg_wchar u1 = utf8_to_unicode(src + srcoff); + pg_wchar u2; + int u1len = unicode_utf8len(u1); + int u2len; + + if (wasalnum) + u2 = unicode_lowercase_simple(u1); + else + u2 = unicode_uppercase_simple(u1); + + u2len = unicode_utf8len(u2); + + Assert(dstoff + u2len + 1 <= dstsize); + + wasalnum = pg_u_isalnum(u2, true); + + unicode_to_utf8(u2, dst + dstoff); + srcoff += u1len; + dstoff += u2len; + } + + Assert(dstoff + 1 <= dstsize); + *(dst + dstoff) = '\0'; + dstoff++; + + /* allocate result buffer of the right size and free workspace */ + result = palloc(dstoff); + memcpy(result, dst, dstoff); + pfree(dst); + } + else { Assert(!mylocale || mylocale->provider == COLLPROVIDER_LIBC); diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c index 3f311e99076..e10d328fc3a 100644 --- a/src/backend/utils/adt/pg_locale.c +++ b/src/backend/utils/adt/pg_locale.c @@ -1270,8 +1270,14 @@ lookup_collation_cache(Oid collation, bool set_flags) if (collform->collprovider == COLLPROVIDER_BUILTIN) { + Datum datum; + const char *colllocale; + + datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_colllocale); + colllocale = TextDatumGetCString(datum); + cache_entry->collate_is_c = true; - cache_entry->ctype_is_c = true; + cache_entry->ctype_is_c = (strcmp(colllocale, "C") == 0); } else if (collform->collprovider == COLLPROVIDER_LIBC) { @@ -1670,7 +1676,6 @@ pg_newlocale_from_collation(Oid collid) collversionstr = TextDatumGetCString(datum); - Assert(collform->collprovider != 
COLLPROVIDER_BUILTIN); if (collform->collprovider == COLLPROVIDER_LIBC) datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_collcollate); else @@ -1725,7 +1730,13 @@ get_collation_actual_version(char collprovider, const char *collcollate) { char *collversion = NULL; - /* the builtin collation provider is not versioned */ + /* + * The only two supported locales (C and C.UTF-8) are both based on memcmp + * and are not expected to change. + * + * Note that the character semantics may change for some locales, but the + * collation version only tracks changes to sort order. + */ if (collprovider == COLLPROVIDER_BUILTIN) return NULL; @@ -2505,13 +2516,17 @@ pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src, int builtin_locale_encoding(const char *locale) { - if (strcmp(locale, "C") != 0) - ereport(ERROR, - (errcode(ERRCODE_WRONG_OBJECT_TYPE), - errmsg("invalid locale name \"%s\" for builtin provider", - locale))); + if (strcmp(locale, "C") == 0) + return -1; + if (strcmp(locale, "C.UTF-8") == 0) + return PG_UTF8; + + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("invalid locale name \"%s\" for builtin provider", + locale))); - return -1; + return 0; /* keep compiler quiet */ } @@ -2525,13 +2540,28 @@ builtin_locale_encoding(const char *locale) const char * builtin_validate_locale(int encoding, const char *locale) { - if (strcmp(locale, "C") != 0) + const char *canonical_name = NULL; + int required_encoding; + + if (strcmp(locale, "C") == 0) + canonical_name = "C"; + else if (strcmp(locale, "C.UTF-8") == 0 || strcmp(locale, "C.UTF8") == 0) + canonical_name = "C.UTF-8"; + + if (!canonical_name) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("invalid locale name \"%s\" for builtin provider", locale))); - return "C"; + required_encoding = builtin_locale_encoding(canonical_name); + if (required_encoding >= 0 && encoding != required_encoding) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("encoding \"%s\" 
does not match locale \"%s\"", + pg_encoding_to_char(encoding), locale))); + + return canonical_name; } diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index c2daff17179..30e17bd1d1e 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -2403,9 +2403,16 @@ setlocales(void) if (locale_provider == COLLPROVIDER_BUILTIN) { - if (strcmp(datlocale, "C") != 0) + if (strcmp(datlocale, "C") == 0) + canonname = "C"; + else if (strcmp(datlocale, "C.UTF-8") == 0 || + strcmp(datlocale, "C.UTF8") == 0) + canonname = "C.UTF-8"; + else pg_fatal("invalid locale name \"%s\" for builtin provider", datlocale); + + datlocale = canonname; } else if (locale_provider == COLLPROVIDER_ICU) { @@ -2695,6 +2702,13 @@ setup_locale_encoding(void) !check_locale_encoding(lc_collate, encodingid)) exit(1); /* check_locale_encoding printed the error */ + if (locale_provider == COLLPROVIDER_BUILTIN) + { + if (strcmp(datlocale, "C.UTF-8") == 0 && encodingid != PG_UTF8) + pg_fatal("builtin provider locale \"%s\" requires encoding \"%s\"", + datlocale, "UTF-8"); + } + if (locale_provider == COLLPROVIDER_ICU && !check_icu_locale_encoding(encodingid)) exit(1); diff --git a/src/bin/initdb/t/001_initdb.pl b/src/bin/initdb/t/001_initdb.pl index 3478f58b02a..c63d3206d99 100644 --- a/src/bin/initdb/t/001_initdb.pl +++ b/src/bin/initdb/t/001_initdb.pl @@ -199,6 +199,23 @@ command_ok( command_ok( [ 'initdb', '--no-sync', + '--locale-provider=builtin', '-E UTF-8', + '--builtin-locale=C.UTF-8', "$tempdir/data8" + ], + 'locale provider builtin with -E UTF-8 --builtin-locale=C.UTF-8'); + +command_fails( + [ + 'initdb', '--no-sync', + '--locale-provider=builtin', '-E SQL_ASCII', + '--builtin-locale=C.UTF-8', "$tempdir/data9" + ], + 'locale provider builtin with --builtin-locale=C.UTF-8 fails for SQL_ASCII' +); + +command_ok( + [ + 'initdb', '--no-sync', '--locale-provider=builtin', '--lc-ctype=C', '--locale=C', "$tempdir/data10" ], diff --git a/src/bin/pg_upgrade/t/002_pg_upgrade.pl 
b/src/bin/pg_upgrade/t/002_pg_upgrade.pl index ed79c0930b0..3e67121a8df 100644 --- a/src/bin/pg_upgrade/t/002_pg_upgrade.pl +++ b/src/bin/pg_upgrade/t/002_pg_upgrade.pl @@ -140,7 +140,7 @@ if ($oldnode->pg_version >= '17devel') { $original_enc_name = "UTF-8"; $original_provider = "b"; - $original_datlocale = "C"; + $original_datlocale = "C.UTF-8"; } elsif ($oldnode->pg_version >= 15 && $ENV{with_icu} eq 'yes') { diff --git a/src/bin/scripts/t/020_createdb.pl b/src/bin/scripts/t/020_createdb.pl index dfd635bfab2..0b371ea4dfc 100644 --- a/src/bin/scripts/t/020_createdb.pl +++ b/src/bin/scripts/t/020_createdb.pl @@ -139,6 +139,24 @@ $node->command_ok( ], 'create database with provider "builtin" and LC_CTYPE=C'); +$node->command_ok( + [ + 'createdb', '-T', + 'template0', '--locale-provider=builtin', + '-E UTF-8', '--builtin-locale=C.UTF8', + 'tbuiltin5' + ], + 'create database with --builtin-locale C.UTF-8 and -E UTF-8'); + +$node->command_fails( + [ + 'createdb', '-T', + 'template0', '--locale-provider=builtin', + '-E LATIN1', '--builtin-locale=C.UTF-8', + 'tbuiltin6' + ], + 'create database with --builtin-locale C.UTF-8 and -E LATIN1'); + $node->command_fails( [ 'createdb', '-T', diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index 295560a7ffd..be18328ea51 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -57,6 +57,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 202403191 +#define CATALOG_VERSION_NO 202403192 #endif diff --git a/src/include/catalog/pg_collation.dat b/src/include/catalog/pg_collation.dat index 938432e8a4b..083b0cdccaa 100644 --- a/src/include/catalog/pg_collation.dat +++ b/src/include/catalog/pg_collation.dat @@ -30,5 +30,8 @@ descr => 'sorts using the Unicode Collation Algorithm with default settings', collname => 'unicode', collprovider => 'i', collencoding => '-1', colllocale => 'und' }, +{ oid => '811', descr => 'sorts by Unicode code point; Unicode and POSIX character 
semantics', + collname => 'pg_c_utf8', collprovider => 'b', collencoding => '6', + colllocale => 'C.UTF-8' }, ] diff --git a/src/test/regress/expected/collate.utf8.out b/src/test/regress/expected/collate.utf8.out new file mode 100644 index 00000000000..eff0ef21ac5 --- /dev/null +++ b/src/test/regress/expected/collate.utf8.out @@ -0,0 +1,136 @@ +/* + * This test is for collations and character operations when using the + * builtin provider with the C.UTF-8 locale. + */ +/* skip test if not UTF8 server encoding */ +SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset +\if :skip_test +\quit +\endif +SET client_encoding TO UTF8; +-- +-- Test PG_C_UTF8 +-- +CREATE COLLATION regress_pg_c_utf8 ( + provider = builtin, locale = 'C_UTF8'); -- fails +ERROR: invalid locale name "C_UTF8" for builtin provider +CREATE COLLATION regress_pg_c_utf8 ( + provider = builtin, locale = 'C.UTF8'); +DROP COLLATION regress_pg_c_utf8; +CREATE COLLATION regress_pg_c_utf8 ( + provider = builtin, locale = 'C.UTF-8'); +CREATE TABLE test_pg_c_utf8 ( + t TEXT COLLATE PG_C_UTF8 +); +INSERT INTO test_pg_c_utf8 VALUES + ('abc DEF 123abc'), + ('ábc sßs ßss DÉF'), + ('DŽxxDŽ džxxDž Džxxdž'), + ('ȺȺȺ'), + ('ⱥⱥⱥ'), + ('ⱥȺ'); +SELECT + t, lower(t), initcap(t), upper(t), + length(convert_to(t, 'UTF8')) AS t_bytes, + length(convert_to(lower(t), 'UTF8')) AS lower_t_bytes, + length(convert_to(initcap(t), 'UTF8')) AS initcap_t_bytes, + length(convert_to(upper(t), 'UTF8')) AS upper_t_bytes + FROM test_pg_c_utf8; + t | lower | initcap | upper | t_bytes | lower_t_bytes | initcap_t_bytes | upper_t_bytes +-----------------+-----------------+-----------------+-----------------+---------+---------------+-----------------+--------------- + abc DEF 123abc | abc def 123abc | Abc Def 123abc | ABC DEF 123ABC | 14 | 14 | 14 | 14 + ábc sßs ßss DÉF | ábc sßs ßss déf | Ábc Sßs ßss Déf | ÁBC SßS ßSS DÉF | 19 | 19 | 19 | 19 + DŽxxDŽ džxxDž Džxxdž | džxxdž džxxdž džxxdž | DŽxxdž DŽxxdž DŽxxdž | DŽXXDŽ DŽXXDŽ DŽXXDŽ | 20 | 
20 | 20 | 20 + ȺȺȺ | ⱥⱥⱥ | Ⱥⱥⱥ | ȺȺȺ | 6 | 9 | 8 | 6 + ⱥⱥⱥ | ⱥⱥⱥ | Ⱥⱥⱥ | ȺȺȺ | 9 | 9 | 8 | 6 + ⱥȺ | ⱥⱥ | Ⱥⱥ | ȺȺ | 5 | 6 | 5 | 4 +(6 rows) + +DROP TABLE test_pg_c_utf8; +-- negative test: Final_Sigma not used for builtin locale C.UTF-8 +SELECT lower('ΑΣ' COLLATE PG_C_UTF8); + lower +------- + ασ +(1 row) + +SELECT lower('ΑͺΣͺ' COLLATE PG_C_UTF8); + lower +------- + αͺσͺ +(1 row) + +SELECT lower('Α΄Σ΄' COLLATE PG_C_UTF8); + lower +------- + α΄σ΄ +(1 row) + +-- properties +SELECT 'xyz' ~ '[[:alnum:]]' COLLATE PG_C_UTF8; + ?column? +---------- + t +(1 row) + +SELECT 'xyz' !~ '[[:upper:]]' COLLATE PG_C_UTF8; + ?column? +---------- + t +(1 row) + +SELECT '@' !~ '[[:alnum:]]' COLLATE PG_C_UTF8; + ?column? +---------- + t +(1 row) + +SELECT '=' ~ '[[:punct:]]' COLLATE PG_C_UTF8; -- symbols are punctuation in posix + ?column? +---------- + t +(1 row) + +SELECT 'a8a' ~ '[[:digit:]]' COLLATE PG_C_UTF8; + ?column? +---------- + t +(1 row) + +SELECT '൧' !~ '\d' COLLATE PG_C_UTF8; -- only 0-9 considered digits in posix + ?column? +---------- + t +(1 row) + +-- case mapping +SELECT 'xYz' ~* 'XyZ' COLLATE PG_C_UTF8; + ?column? +---------- + t +(1 row) + +SELECT 'xAb' ~* '[W-Y]' COLLATE PG_C_UTF8; + ?column? +---------- + t +(1 row) + +SELECT 'xAb' !~* '[c-d]' COLLATE PG_C_UTF8; + ?column? +---------- + t +(1 row) + +SELECT 'Δ' ~* '[γ-λ]' COLLATE PG_C_UTF8; + ?column? +---------- + t +(1 row) + +SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_C_UTF8; -- same as above with cases reversed + ?column? +---------- + t +(1 row) + diff --git a/src/test/regress/expected/collate.utf8_1.out b/src/test/regress/expected/collate.utf8_1.out new file mode 100644 index 00000000000..e73fdf50c30 --- /dev/null +++ b/src/test/regress/expected/collate.utf8_1.out @@ -0,0 +1,8 @@ +/* + * This test is for collations and character operations when using the + * builtin provider with the C.UTF-8 locale. 
+ */ +/* skip test if not UTF8 server encoding */ +SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset +\if :skip_test +\quit diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index 1d8a414eea7..e48cb4b7a38 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -78,9 +78,9 @@ test: brin_bloom brin_multi # psql depends on create_am # amutils depends on geometry, create_index_spgist, hash_index, brin # ---------- -test: create_table_like alter_generic alter_operator misc async dbsize merge misc_functions sysviews tsrf tid tidscan tidrangescan collate.icu.utf8 incremental_sort create_role without_overlaps +test: create_table_like alter_generic alter_operator misc async dbsize merge misc_functions sysviews tsrf tid tidscan tidrangescan collate.utf8 collate.icu.utf8 incremental_sort create_role without_overlaps -# collate.*.utf8 tests cannot be run in parallel with each other +# collate.linux.utf8 and collate.icu.utf8 tests cannot be run in parallel with each other test: rules psql psql_crosstab amutils stats_ext collate.linux.utf8 collate.windows.win1252 # ---------- diff --git a/src/test/regress/sql/collate.utf8.sql b/src/test/regress/sql/collate.utf8.sql new file mode 100644 index 00000000000..1f5f9ef491d --- /dev/null +++ b/src/test/regress/sql/collate.utf8.sql @@ -0,0 +1,67 @@ +/* + * This test is for collations and character operations when using the + * builtin provider with the C.UTF-8 locale. 
+ */ + +/* skip test if not UTF8 server encoding */ +SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset +\if :skip_test +\quit +\endif + +SET client_encoding TO UTF8; + +-- +-- Test PG_C_UTF8 +-- + +CREATE COLLATION regress_pg_c_utf8 ( + provider = builtin, locale = 'C_UTF8'); -- fails +CREATE COLLATION regress_pg_c_utf8 ( + provider = builtin, locale = 'C.UTF8'); +DROP COLLATION regress_pg_c_utf8; +CREATE COLLATION regress_pg_c_utf8 ( + provider = builtin, locale = 'C.UTF-8'); + +CREATE TABLE test_pg_c_utf8 ( + t TEXT COLLATE PG_C_UTF8 +); +INSERT INTO test_pg_c_utf8 VALUES + ('abc DEF 123abc'), + ('ábc sßs ßss DÉF'), + ('DŽxxDŽ džxxDž Džxxdž'), + ('ȺȺȺ'), + ('ⱥⱥⱥ'), + ('ⱥȺ'); + +SELECT + t, lower(t), initcap(t), upper(t), + length(convert_to(t, 'UTF8')) AS t_bytes, + length(convert_to(lower(t), 'UTF8')) AS lower_t_bytes, + length(convert_to(initcap(t), 'UTF8')) AS initcap_t_bytes, + length(convert_to(upper(t), 'UTF8')) AS upper_t_bytes + FROM test_pg_c_utf8; + +DROP TABLE test_pg_c_utf8; + +-- negative test: Final_Sigma not used for builtin locale C.UTF-8 +SELECT lower('ΑΣ' COLLATE PG_C_UTF8); +SELECT lower('ΑͺΣͺ' COLLATE PG_C_UTF8); +SELECT lower('Α΄Σ΄' COLLATE PG_C_UTF8); + +-- properties + +SELECT 'xyz' ~ '[[:alnum:]]' COLLATE PG_C_UTF8; +SELECT 'xyz' !~ '[[:upper:]]' COLLATE PG_C_UTF8; +SELECT '@' !~ '[[:alnum:]]' COLLATE PG_C_UTF8; +SELECT '=' ~ '[[:punct:]]' COLLATE PG_C_UTF8; -- symbols are punctuation in posix +SELECT 'a8a' ~ '[[:digit:]]' COLLATE PG_C_UTF8; +SELECT '൧' !~ '\d' COLLATE PG_C_UTF8; -- only 0-9 considered digits in posix + +-- case mapping + +SELECT 'xYz' ~* 'XyZ' COLLATE PG_C_UTF8; +SELECT 'xAb' ~* '[W-Y]' COLLATE PG_C_UTF8; +SELECT 'xAb' !~* '[c-d]' COLLATE PG_C_UTF8; +SELECT 'Δ' ~* '[γ-λ]' COLLATE PG_C_UTF8; +SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_C_UTF8; -- same as above with cases reversed |