aboutsummaryrefslogtreecommitdiff
path: root/src/backend/utils/adt/varlena.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/utils/adt/varlena.c')
-rw-r--r--src/backend/utils/adt/varlena.c150
1 files changed, 150 insertions, 0 deletions
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index 907b5ab7b02..0e464950e15 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -22,6 +22,7 @@
#include "catalog/pg_type.h"
#include "common/hashfn.h"
#include "common/int.h"
+#include "common/unicode_norm.h"
#include "lib/hyperloglog.h"
#include "libpq/pqformat.h"
#include "miscadmin.h"
@@ -5976,3 +5977,152 @@ rest_of_char_same(const char *s1, const char *s2, int len)
#include "levenshtein.c"
#define LEVENSHTEIN_LESS_EQUAL
#include "levenshtein.c"
+
+
+/*
+ * Unicode support
+ */
+
+static UnicodeNormalizationForm
+unicode_norm_form_from_string(const char *formstr)
+{
+ UnicodeNormalizationForm form = -1;
+
+ /*
+ * Might as well check this while we're here.
+ */
+ if (GetDatabaseEncoding() != PG_UTF8)
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("Unicode normalization can only be performed if server encoding is UTF8")));
+
+ if (pg_strcasecmp(formstr, "NFC") == 0)
+ form = UNICODE_NFC;
+ else if (pg_strcasecmp(formstr, "NFD") == 0)
+ form = UNICODE_NFD;
+ else if (pg_strcasecmp(formstr, "NFKC") == 0)
+ form = UNICODE_NFKC;
+ else if (pg_strcasecmp(formstr, "NFKD") == 0)
+ form = UNICODE_NFKD;
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("invalid normalization form: %s", formstr)));
+
+ return form;
+}
+
+Datum
+unicode_normalize_func(PG_FUNCTION_ARGS)
+{
+ text *input = PG_GETARG_TEXT_PP(0);
+ char *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
+ UnicodeNormalizationForm form;
+ int size;
+ pg_wchar *input_chars;
+ pg_wchar *output_chars;
+ unsigned char *p;
+ text *result;
+ int i;
+
+ form = unicode_norm_form_from_string(formstr);
+
+ /* convert to pg_wchar */
+ size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
+ input_chars = palloc((size + 1) * sizeof(pg_wchar));
+ p = (unsigned char *) VARDATA_ANY(input);
+ for (i = 0; i < size; i++)
+ {
+ input_chars[i] = utf8_to_unicode(p);
+ p += pg_utf_mblen(p);
+ }
+ input_chars[i] = (pg_wchar) '\0';
+ Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
+
+ /* action */
+ output_chars = unicode_normalize(form, input_chars);
+
+ /* convert back to UTF-8 string */
+ size = 0;
+ for (pg_wchar *wp = output_chars; *wp; wp++)
+ {
+ unsigned char buf[4];
+
+ unicode_to_utf8(*wp, buf);
+ size += pg_utf_mblen(buf);
+ }
+
+ result = palloc(size + VARHDRSZ);
+ SET_VARSIZE(result, size + VARHDRSZ);
+
+ p = (unsigned char *) VARDATA_ANY(result);
+ for (pg_wchar *wp = output_chars; *wp; wp++)
+ {
+ unicode_to_utf8(*wp, p);
+ p += pg_utf_mblen(p);
+ }
+ Assert((char *) p == (char *) result + size + VARHDRSZ);
+
+ PG_RETURN_TEXT_P(result);
+}
+
+/*
+ * Check whether the string is in the specified Unicode normalization form.
+ *
+ * This is done by convering the string to the specified normal form and then
+ * comparing that to the original string. To speed that up, we also apply the
+ * "quick check" algorithm specified in UAX #15, which can give a yes or no
+ * answer for many strings by just scanning the string once.
+ *
+ * This function should generally be optimized for the case where the string
+ * is in fact normalized. In that case, we'll end up looking at the entire
+ * string, so it's probably not worth doing any incremental conversion etc.
+ */
+Datum
+unicode_is_normalized(PG_FUNCTION_ARGS)
+{
+ text *input = PG_GETARG_TEXT_PP(0);
+ char *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
+ UnicodeNormalizationForm form;
+ int size;
+ pg_wchar *input_chars;
+ pg_wchar *output_chars;
+ unsigned char *p;
+ int i;
+ UnicodeNormalizationQC quickcheck;
+ int output_size;
+ bool result;
+
+ form = unicode_norm_form_from_string(formstr);
+
+ /* convert to pg_wchar */
+ size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
+ input_chars = palloc((size + 1) * sizeof(pg_wchar));
+ p = (unsigned char *) VARDATA_ANY(input);
+ for (i = 0; i < size; i++)
+ {
+ input_chars[i] = utf8_to_unicode(p);
+ p += pg_utf_mblen(p);
+ }
+ input_chars[i] = (pg_wchar) '\0';
+ Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
+
+ /* quick check (see UAX #15) */
+ quickcheck = unicode_is_normalized_quickcheck(form, input_chars);
+ if (quickcheck == UNICODE_NORM_QC_YES)
+ PG_RETURN_BOOL(true);
+ else if (quickcheck == UNICODE_NORM_QC_NO)
+ PG_RETURN_BOOL(false);
+
+ /* normalize and compare with original */
+ output_chars = unicode_normalize(form, input_chars);
+
+ output_size = 0;
+ for (pg_wchar *wp = output_chars; *wp; wp++)
+ output_size++;
+
+ result = (size == output_size) &&
+ (memcmp(input_chars, output_chars, size * sizeof(pg_wchar)) == 0);
+
+ PG_RETURN_BOOL(result);
+}