diff options
Diffstat (limited to 'src/backend/utils/adt/varlena.c')
-rw-r--r-- | src/backend/utils/adt/varlena.c | 150 |
1 files changed, 150 insertions, 0 deletions
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index 907b5ab7b02..0e464950e15 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -22,6 +22,7 @@ #include "catalog/pg_type.h" #include "common/hashfn.h" #include "common/int.h" +#include "common/unicode_norm.h" #include "lib/hyperloglog.h" #include "libpq/pqformat.h" #include "miscadmin.h" @@ -5976,3 +5977,152 @@ rest_of_char_same(const char *s1, const char *s2, int len) #include "levenshtein.c" #define LEVENSHTEIN_LESS_EQUAL #include "levenshtein.c" + + +/* + * Unicode support + */ + +static UnicodeNormalizationForm +unicode_norm_form_from_string(const char *formstr) +{ + UnicodeNormalizationForm form = -1; + + /* + * Might as well check this while we're here. + */ + if (GetDatabaseEncoding() != PG_UTF8) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("Unicode normalization can only be performed if server encoding is UTF8"))); + + if (pg_strcasecmp(formstr, "NFC") == 0) + form = UNICODE_NFC; + else if (pg_strcasecmp(formstr, "NFD") == 0) + form = UNICODE_NFD; + else if (pg_strcasecmp(formstr, "NFKC") == 0) + form = UNICODE_NFKC; + else if (pg_strcasecmp(formstr, "NFKD") == 0) + form = UNICODE_NFKD; + else + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid normalization form: %s", formstr))); + + return form; +} + +Datum +unicode_normalize_func(PG_FUNCTION_ARGS) +{ + text *input = PG_GETARG_TEXT_PP(0); + char *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1)); + UnicodeNormalizationForm form; + int size; + pg_wchar *input_chars; + pg_wchar *output_chars; + unsigned char *p; + text *result; + int i; + + form = unicode_norm_form_from_string(formstr); + + /* convert to pg_wchar */ + size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input)); + input_chars = palloc((size + 1) * sizeof(pg_wchar)); + p = (unsigned char *) VARDATA_ANY(input); + for (i = 0; i < size; i++) + { + input_chars[i] = utf8_to_unicode(p); + p += pg_utf_mblen(p); + } + input_chars[i] = (pg_wchar) '\0'; + Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input)); + + /* action */ + output_chars = unicode_normalize(form, input_chars); + + /* convert back to UTF-8 string */ + size = 0; + for (pg_wchar *wp = output_chars; *wp; wp++) + { + unsigned char buf[4]; + + unicode_to_utf8(*wp, buf); + size += pg_utf_mblen(buf); + } + + result = palloc(size + VARHDRSZ); + SET_VARSIZE(result, size + VARHDRSZ); + + p = (unsigned char *) VARDATA_ANY(result); + for (pg_wchar *wp = output_chars; *wp; wp++) + { + unicode_to_utf8(*wp, p); + p += pg_utf_mblen(p); + } + Assert((char *) p == (char *) result + size + VARHDRSZ); + + PG_RETURN_TEXT_P(result); +} + +/* + * Check whether the string is in the specified Unicode normalization form. + * + * This is done by convering the string to the specified normal form and then + * comparing that to the original string. To speed that up, we also apply the + * "quick check" algorithm specified in UAX #15, which can give a yes or no + * answer for many strings by just scanning the string once. + * + * This function should generally be optimized for the case where the string + * is in fact normalized. In that case, we'll end up looking at the entire + * string, so it's probably not worth doing any incremental conversion etc. + */ +Datum +unicode_is_normalized(PG_FUNCTION_ARGS) +{ + text *input = PG_GETARG_TEXT_PP(0); + char *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1)); + UnicodeNormalizationForm form; + int size; + pg_wchar *input_chars; + pg_wchar *output_chars; + unsigned char *p; + int i; + UnicodeNormalizationQC quickcheck; + int output_size; + bool result; + + form = unicode_norm_form_from_string(formstr); + + /* convert to pg_wchar */ + size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input)); + input_chars = palloc((size + 1) * sizeof(pg_wchar)); + p = (unsigned char *) VARDATA_ANY(input); + for (i = 0; i < size; i++) + { + input_chars[i] = utf8_to_unicode(p); + p += pg_utf_mblen(p); + } + input_chars[i] = (pg_wchar) '\0'; + Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input)); + + /* quick check (see UAX #15) */ + quickcheck = unicode_is_normalized_quickcheck(form, input_chars); + if (quickcheck == UNICODE_NORM_QC_YES) + PG_RETURN_BOOL(true); + else if (quickcheck == UNICODE_NORM_QC_NO) + PG_RETURN_BOOL(false); + + /* normalize and compare with original */ + output_chars = unicode_normalize(form, input_chars); + + output_size = 0; + for (pg_wchar *wp = output_chars; *wp; wp++) + output_size++; + + result = (size == output_size) && + (memcmp(input_chars, output_chars, size * sizeof(pg_wchar)) == 0); + + PG_RETURN_BOOL(result); +} |