diff options
author | Peter Eisentraut <peter_e@gmx.net> | 2017-08-21 11:22:00 -0400 |
---|---|---|
committer | Peter Eisentraut <peter_e@gmx.net> | 2017-08-21 19:21:14 -0400 |
commit | 958ffb8c286d93d1bfced17e6300d13f9634b431 (patch) | |
tree | ee8b8c141ef3a3f6c9a2bea83359c5e5324177e5 | |
parent | a79fb8e0c452a9b88206e2abd4add2b432a2596b (diff) | |
download | postgresql-958ffb8c286d93d1bfced17e6300d13f9634b431.tar.gz postgresql-958ffb8c286d93d1bfced17e6300d13f9634b431.zip |
Don't install ICU collation keyword variants
Users can still create them themselves. Instead, document the Unicode TR 35
collation options for ICU, so that users can define whichever variants they need.
Reviewed-by: Peter Geoghegan <pg@bowt.ie>
-rw-r--r-- | doc/src/sgml/charset.sgml | 98 | ||||
-rw-r--r-- | src/backend/commands/collationcmds.c | 71 |
2 files changed, 84 insertions, 85 deletions
diff --git a/doc/src/sgml/charset.sgml b/doc/src/sgml/charset.sgml index f2a4acc1150..44e43503a61 100644 --- a/doc/src/sgml/charset.sgml +++ b/doc/src/sgml/charset.sgml @@ -665,13 +665,6 @@ SELECT a COLLATE "C" < b COLLATE "POSIX" FROM test1; </varlistentry> <varlistentry> - <term><literal>de-u-co-phonebk-x-icu</literal></term> - <listitem> - <para>German collation, phone book variant</para> - </listitem> - </varlistentry> - - <varlistentry> <term><literal>de-AT-x-icu</literal></term> <listitem> <para>German collation for Austria, default variant</para> @@ -684,13 +677,6 @@ SELECT a COLLATE "C" < b COLLATE "POSIX" FROM test1; </varlistentry> <varlistentry> - <term><literal>de-AT-u-co-phonebk-x-icu</literal></term> - <listitem> - <para>German collation for Austria, phone book variant</para> - </listitem> - </varlistentry> - - <varlistentry> <term><literal>und-x-icu</literal> (for <quote>undefined</quote>)</term> <listitem> <para> @@ -709,6 +695,90 @@ SELECT a COLLATE "C" < b COLLATE "POSIX" FROM test1; will draw an error along the lines of <quote>collation "de-x-icu" for encoding "WIN874" does not exist</>. </para> + + <para> + ICU allows collations to be customized beyond the basic language+country + set that is preloaded by <command>initdb</command>. Users are encouraged + to define their own collation objects that make use of these facilities to + suit the sorting behavior to their requirements. 
Here are some examples: + + <variablelist> + <varlistentry> + <term><literal>CREATE COLLATION "de-u-co-phonebk-x-icu" (provider = icu, locale = 'de-u-co-phonebk')</literal></term> + <listitem> + <para>German collation with phone book collation type</para> + </listitem> + </varlistentry> + + <varlistentry> + <term><literal>CREATE COLLATION "und-u-co-emoji-x-icu" (provider = icu, locale = 'und-u-co-emoji')</literal></term> + <listitem> + <para> + Root collation with Emoji collation type, per Unicode Technical Standard #51 + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><literal>CREATE COLLATION digitslast (provider = icu, locale = 'en-u-kr-latn-digit')</literal></term> + <listitem> + <para> + Sort digits after Latin letters. (The default is digits before letters.) + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><literal>CREATE COLLATION upperfirst (provider = icu, locale = 'en-u-kf-upper')</literal></term> + <listitem> + <para> + Sort upper-case letters before lower-case letters. (The default is + lower-case letters first.) + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><literal>CREATE COLLATION special (provider = icu, locale = 'en-u-kf-upper-kr-latn-digit')</literal></term> + <listitem> + <para> + Combines both of the above options. + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><literal>CREATE COLLATION numeric (provider = icu, locale = 'en-u-kn-true')</literal></term> + <listitem> + <para> + Numeric ordering, sorts sequences of digits by their numeric value, + for example: <literal>A-21</literal> &lt; <literal>A-123</literal> + (also known as natural sort). + </para> + </listitem> + </varlistentry> + </variablelist> + + See <ulink url="http://unicode.org/reports/tr35/tr35-collation.html">Unicode + Technical Standard #35</ulink> + and <ulink url="https://tools.ietf.org/html/bcp47">BCP 47</ulink> for + details. 
The list of possible collation types (<literal>co</literal> + subtag) can be found in + the <ulink url="http://www.unicode.org/repos/cldr/trunk/common/bcp47/collation.xml">CLDR + repository</ulink>. + The <ulink url="https://ssl.icu-project.org/icu-bin/locexp">ICU Locale + Explorer</ulink> can be used to check the details of a particular locale + definition. + </para> + + <para> + Note that while this system allows creating collations that <quote>ignore + case</quote> or <quote>ignore accents</quote> or similar (using + the <literal>ks</literal> key), PostgreSQL does not at the moment allow + such collations to act in a truly case- or accent-insensitive manner. Any + strings that compare equal according to the collation but are not + byte-wise equal will be sorted according to their byte values. + </para> </sect4> </sect3> diff --git a/src/backend/commands/collationcmds.c b/src/backend/commands/collationcmds.c index d36ce535604..9437731276f 100644 --- a/src/backend/commands/collationcmds.c +++ b/src/backend/commands/collationcmds.c @@ -687,30 +687,11 @@ pg_import_system_collations(PG_FUNCTION_ARGS) */ for (i = -1; i < uloc_countAvailable(); i++) { - /* - * In ICU 4.2, ucol_getKeywordValuesForLocale() sometimes returns - * values that will not be accepted by uloc_toLanguageTag(). Skip - * loading keyword variants in that version. (Both - * ucol_getKeywordValuesForLocale() and uloc_toLanguageTag() are - * new in ICU 4.2, so older versions are not supported at all.) - * - * XXX We have no information about ICU 4.3 through 4.7, but we - * know the code below works with 4.8. 
- */ -#if U_ICU_VERSION_MAJOR_NUM > 4 || (U_ICU_VERSION_MAJOR_NUM == 4 && U_ICU_VERSION_MINOR_NUM > 2) -#define LOAD_ICU_KEYWORD_VARIANTS -#endif - const char *name; char *langtag; char *icucomment; const char *collcollate; Oid collid; -#ifdef LOAD_ICU_KEYWORD_VARIANTS - UEnumeration *en; - UErrorCode status; - const char *val; -#endif if (i == -1) name = ""; /* ICU root locale */ @@ -744,58 +725,6 @@ pg_import_system_collations(PG_FUNCTION_ARGS) CreateComments(collid, CollationRelationId, 0, icucomment); } - - /* - * Add keyword variants, if enabled. - */ -#ifdef LOAD_ICU_KEYWORD_VARIANTS - status = U_ZERO_ERROR; - en = ucol_getKeywordValuesForLocale("collation", name, TRUE, &status); - if (U_FAILURE(status)) - ereport(ERROR, - (errmsg("could not get keyword values for locale \"%s\": %s", - name, u_errorName(status)))); - - status = U_ZERO_ERROR; - uenum_reset(en, &status); - while ((val = uenum_next(en, NULL, &status))) - { - char *localeid = psprintf("%s@collation=%s", name, val); - - langtag = get_icu_language_tag(localeid); - collcollate = U_ICU_VERSION_MAJOR_NUM >= 54 ? langtag : localeid; - - /* - * Be paranoid about not allowing any non-ASCII strings into - * pg_collation - */ - if (!is_all_ascii(langtag) || !is_all_ascii(collcollate)) - continue; - - collid = CollationCreate(psprintf("%s-x-icu", langtag), - nspid, GetUserId(), - COLLPROVIDER_ICU, -1, - collcollate, collcollate, - get_collation_actual_version(COLLPROVIDER_ICU, collcollate), - true, true); - if (OidIsValid(collid)) - { - ncreated++; - - CommandCounterIncrement(); - - icucomment = get_icu_locale_comment(localeid); - if (icucomment) - CreateComments(collid, CollationRelationId, 0, - icucomment); - } - } - if (U_FAILURE(status)) - ereport(ERROR, - (errmsg("could not get keyword values for locale \"%s\": %s", - name, u_errorName(status)))); - uenum_close(en); -#endif /* LOAD_ICU_KEYWORD_VARIANTS */ } } #endif /* USE_ICU */ |