aboutsummaryrefslogtreecommitdiff
path: root/src/common/unicode_norm.c
diff options
context:
space:
mode:
authorMichael Paquier <michael@paquier.xyz>2020-10-11 19:09:01 +0900
committerMichael Paquier <michael@paquier.xyz>2020-10-11 19:09:01 +0900
commit80f8eb79e24d9b7963eaf17ce846667e2c6b6e6f (patch)
treec5ac3c2938b9fd81f8319a44429817454375ce1d /src/common/unicode_norm.c
parent85d08b8b721fb3b9359bca9325bc425cc95c30b1 (diff)
downloadpostgresql-80f8eb79e24d9b7963eaf17ce846667e2c6b6e6f.tar.gz
postgresql-80f8eb79e24d9b7963eaf17ce846667e2c6b6e6f.zip
Use perfect hash for NFC and NFKC Unicode Normalization quick check
This makes the normalization quick check about 30% faster for NFC and 50% faster for NFKC than the binary search used previously. The hash lookup reuses the existing array of bit fields used for the binary search to get the quick check property and is generated as part of "make update-unicode" in src/common/unicode/. Author: John Naylor Reviewed-by: Mark Dilger, Michael Paquier Discussion: https://postgr.es/m/CACPNZCt4fbJ0_bGrN5QPt34N4whv=mszM0LMVQdoa2rC9UMRXA@mail.gmail.com
Diffstat (limited to 'src/common/unicode_norm.c')
-rw-r--r--src/common/unicode_norm.c48
1 files changed, 27 insertions, 21 deletions
diff --git a/src/common/unicode_norm.c b/src/common/unicode_norm.c
index ab5ce593456..626645ac870 100644
--- a/src/common/unicode_norm.c
+++ b/src/common/unicode_norm.c
@@ -465,15 +465,32 @@ get_canonical_class(pg_wchar ch)
return entry->comb_class;
}
-static int
-qc_compare(const void *p1, const void *p2)
+static const pg_unicode_normprops *
+qc_hash_lookup(pg_wchar ch, const pg_unicode_norminfo *norminfo)
{
- uint32 v1,
- v2;
+ int h;
+ uint32 hashkey;
- v1 = ((const pg_unicode_normprops *) p1)->codepoint;
- v2 = ((const pg_unicode_normprops *) p2)->codepoint;
- return (v1 - v2);
+ /*
+ * Compute the hash function. The hash key is the codepoint with the bytes
+ * in network order.
+ */
+ hashkey = htonl(ch);
+ h = norminfo->hash(&hashkey);
+
+ /* An out-of-range result implies no match */
+ if (h < 0 || h >= norminfo->num_normprops)
+ return NULL;
+
+ /*
+ * Since it's a perfect hash, we need only match to the specific codepoint
+ * it identifies.
+ */
+ if (ch != norminfo->normprops[h].codepoint)
+ return NULL;
+
+ /* Success! */
+ return &norminfo->normprops[h];
}
/*
@@ -482,26 +499,15 @@ qc_compare(const void *p1, const void *p2)
static UnicodeNormalizationQC
qc_is_allowed(UnicodeNormalizationForm form, pg_wchar ch)
{
- pg_unicode_normprops key;
- pg_unicode_normprops *found = NULL;
-
- key.codepoint = ch;
+ const pg_unicode_normprops *found = NULL;
switch (form)
{
case UNICODE_NFC:
- found = bsearch(&key,
- UnicodeNormProps_NFC_QC,
- lengthof(UnicodeNormProps_NFC_QC),
- sizeof(pg_unicode_normprops),
- qc_compare);
+ found = qc_hash_lookup(ch, &UnicodeNormInfo_NFC_QC);
break;
case UNICODE_NFKC:
- found = bsearch(&key,
- UnicodeNormProps_NFKC_QC,
- lengthof(UnicodeNormProps_NFKC_QC),
- sizeof(pg_unicode_normprops),
- qc_compare);
+ found = qc_hash_lookup(ch, &UnicodeNormInfo_NFKC_QC);
break;
default:
Assert(false);