aboutsummaryrefslogtreecommitdiff
path: root/src/backend/utils/adt/varlena.c
diff options
context:
space:
mode:
authorPeter Eisentraut <peter@eisentraut.org>2020-03-26 08:14:00 +0100
committerPeter Eisentraut <peter@eisentraut.org>2020-04-02 08:56:27 +0200
commit2991ac5fc9b3904ca4582be6d323497d7c3d17c9 (patch)
treed558847de39ee972b261026d4846f1f31e8dff12 /src/backend/utils/adt/varlena.c
parent070c3d3937e75e04d36405287353b7eca516555d (diff)
downloadpostgresql-2991ac5fc9b3904ca4582be6d323497d7c3d17c9.tar.gz
postgresql-2991ac5fc9b3904ca4582be6d323497d7c3d17c9.zip
Add SQL functions for Unicode normalization
This adds SQL expressions NORMALIZE() and IS NORMALIZED to convert and check Unicode normal forms, per SQL standard. To support fast IS NORMALIZED tests, we pull in a new data file DerivedNormalizationProps.txt from Unicode and build a lookup table from that, using techniques similar to ones already used for other Unicode data. make update-unicode will keep it up to date. We only build and use these tables for the NFC and NFKC forms, because they are too big for NFD and NFKD and the improvement is not significant enough there. Reviewed-by: Daniel Verite <daniel@manitou-mail.org> Reviewed-by: Andreas Karlsson <andreas@proxel.se> Discussion: https://www.postgresql.org/message-id/flat/c1909f27-c269-2ed9-12f8-3ab72c8caf7a@2ndquadrant.com
Diffstat (limited to 'src/backend/utils/adt/varlena.c')
-rw-r--r--src/backend/utils/adt/varlena.c150
1 files changed, 150 insertions, 0 deletions
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index 907b5ab7b02..0e464950e15 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -22,6 +22,7 @@
#include "catalog/pg_type.h"
#include "common/hashfn.h"
#include "common/int.h"
+#include "common/unicode_norm.h"
#include "lib/hyperloglog.h"
#include "libpq/pqformat.h"
#include "miscadmin.h"
@@ -5976,3 +5977,152 @@ rest_of_char_same(const char *s1, const char *s2, int len)
#include "levenshtein.c"
#define LEVENSHTEIN_LESS_EQUAL
#include "levenshtein.c"
+
+
+/*
+ * Unicode support
+ */
+
+static UnicodeNormalizationForm
+unicode_norm_form_from_string(const char *formstr)
+{
+ UnicodeNormalizationForm form = -1;
+
+ /*
+ * Might as well check this while we're here.
+ */
+ if (GetDatabaseEncoding() != PG_UTF8)
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("Unicode normalization can only be performed if server encoding is UTF8")));
+
+ if (pg_strcasecmp(formstr, "NFC") == 0)
+ form = UNICODE_NFC;
+ else if (pg_strcasecmp(formstr, "NFD") == 0)
+ form = UNICODE_NFD;
+ else if (pg_strcasecmp(formstr, "NFKC") == 0)
+ form = UNICODE_NFKC;
+ else if (pg_strcasecmp(formstr, "NFKD") == 0)
+ form = UNICODE_NFKD;
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("invalid normalization form: %s", formstr)));
+
+ return form;
+}
+
+Datum
+unicode_normalize_func(PG_FUNCTION_ARGS)
+{
+ text *input = PG_GETARG_TEXT_PP(0);
+ char *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
+ UnicodeNormalizationForm form;
+ int size;
+ pg_wchar *input_chars;
+ pg_wchar *output_chars;
+ unsigned char *p;
+ text *result;
+ int i;
+
+ form = unicode_norm_form_from_string(formstr);
+
+ /* convert to pg_wchar */
+ size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
+ input_chars = palloc((size + 1) * sizeof(pg_wchar));
+ p = (unsigned char *) VARDATA_ANY(input);
+ for (i = 0; i < size; i++)
+ {
+ input_chars[i] = utf8_to_unicode(p);
+ p += pg_utf_mblen(p);
+ }
+ input_chars[i] = (pg_wchar) '\0';
+ Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
+
+ /* action */
+ output_chars = unicode_normalize(form, input_chars);
+
+ /* convert back to UTF-8 string */
+ size = 0;
+ for (pg_wchar *wp = output_chars; *wp; wp++)
+ {
+ unsigned char buf[4];
+
+ unicode_to_utf8(*wp, buf);
+ size += pg_utf_mblen(buf);
+ }
+
+ result = palloc(size + VARHDRSZ);
+ SET_VARSIZE(result, size + VARHDRSZ);
+
+ p = (unsigned char *) VARDATA_ANY(result);
+ for (pg_wchar *wp = output_chars; *wp; wp++)
+ {
+ unicode_to_utf8(*wp, p);
+ p += pg_utf_mblen(p);
+ }
+ Assert((char *) p == (char *) result + size + VARHDRSZ);
+
+ PG_RETURN_TEXT_P(result);
+}
+
+/*
+ * Check whether the string is in the specified Unicode normalization form.
+ *
+ * This is done by convering the string to the specified normal form and then
+ * comparing that to the original string. To speed that up, we also apply the
+ * "quick check" algorithm specified in UAX #15, which can give a yes or no
+ * answer for many strings by just scanning the string once.
+ *
+ * This function should generally be optimized for the case where the string
+ * is in fact normalized. In that case, we'll end up looking at the entire
+ * string, so it's probably not worth doing any incremental conversion etc.
+ */
+Datum
+unicode_is_normalized(PG_FUNCTION_ARGS)
+{
+ text *input = PG_GETARG_TEXT_PP(0);
+ char *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
+ UnicodeNormalizationForm form;
+ int size;
+ pg_wchar *input_chars;
+ pg_wchar *output_chars;
+ unsigned char *p;
+ int i;
+ UnicodeNormalizationQC quickcheck;
+ int output_size;
+ bool result;
+
+ form = unicode_norm_form_from_string(formstr);
+
+ /* convert to pg_wchar */
+ size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
+ input_chars = palloc((size + 1) * sizeof(pg_wchar));
+ p = (unsigned char *) VARDATA_ANY(input);
+ for (i = 0; i < size; i++)
+ {
+ input_chars[i] = utf8_to_unicode(p);
+ p += pg_utf_mblen(p);
+ }
+ input_chars[i] = (pg_wchar) '\0';
+ Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
+
+ /* quick check (see UAX #15) */
+ quickcheck = unicode_is_normalized_quickcheck(form, input_chars);
+ if (quickcheck == UNICODE_NORM_QC_YES)
+ PG_RETURN_BOOL(true);
+ else if (quickcheck == UNICODE_NORM_QC_NO)
+ PG_RETURN_BOOL(false);
+
+ /* normalize and compare with original */
+ output_chars = unicode_normalize(form, input_chars);
+
+ output_size = 0;
+ for (pg_wchar *wp = output_chars; *wp; wp++)
+ output_size++;
+
+ result = (size == output_size) &&
+ (memcmp(input_chars, output_chars, size * sizeof(pg_wchar)) == 0);
+
+ PG_RETURN_BOOL(result);
+}