diff options
author | Michael Paquier <michael@paquier.xyz> | 2020-10-11 19:09:01 +0900 |
---|---|---|
committer | Michael Paquier <michael@paquier.xyz> | 2020-10-11 19:09:01 +0900 |
commit | 80f8eb79e24d9b7963eaf17ce846667e2c6b6e6f (patch) | |
tree | c5ac3c2938b9fd81f8319a44429817454375ce1d /src/common/unicode_norm.c | |
parent | 85d08b8b721fb3b9359bca9325bc425cc95c30b1 (diff) | |
download | postgresql-80f8eb79e24d9b7963eaf17ce846667e2c6b6e6f.tar.gz postgresql-80f8eb79e24d9b7963eaf17ce846667e2c6b6e6f.zip |
Use perfect hash for NFC and NFKC Unicode Normalization quick check
This makes the normalization quick check about 30% faster for NFC and
50% faster for NFKC than the binary search used previously. The hash
lookup reuses the existing array of bit fields used for the binary
search to get the quick check property and is generated as part of "make
update-unicode" in src/common/unicode/.
Author: John Naylor
Reviewed-by: Mark Dilger, Michael Paquier
Discussion: https://postgr.es/m/CACPNZCt4fbJ0_bGrN5QPt34N4whv=mszM0LMVQdoa2rC9UMRXA@mail.gmail.com
Diffstat (limited to 'src/common/unicode_norm.c')
-rw-r--r-- | src/common/unicode_norm.c | 48 |
1 files changed, 27 insertions, 21 deletions
diff --git a/src/common/unicode_norm.c b/src/common/unicode_norm.c index ab5ce593456..626645ac870 100644 --- a/src/common/unicode_norm.c +++ b/src/common/unicode_norm.c @@ -465,15 +465,32 @@ get_canonical_class(pg_wchar ch) return entry->comb_class; } -static int -qc_compare(const void *p1, const void *p2) +static const pg_unicode_normprops * +qc_hash_lookup(pg_wchar ch, const pg_unicode_norminfo *norminfo) { - uint32 v1, - v2; + int h; + uint32 hashkey; - v1 = ((const pg_unicode_normprops *) p1)->codepoint; - v2 = ((const pg_unicode_normprops *) p2)->codepoint; - return (v1 - v2); + /* + * Compute the hash function. The hash key is the codepoint with the bytes + * in network order. + */ + hashkey = htonl(ch); + h = norminfo->hash(&hashkey); + + /* An out-of-range result implies no match */ + if (h < 0 || h >= norminfo->num_normprops) + return NULL; + + /* + * Since it's a perfect hash, we need only match to the specific codepoint + * it identifies. + */ + if (ch != norminfo->normprops[h].codepoint) + return NULL; + + /* Success! */ + return &norminfo->normprops[h]; } /* @@ -482,26 +499,15 @@ qc_compare(const void *p1, const void *p2) static UnicodeNormalizationQC qc_is_allowed(UnicodeNormalizationForm form, pg_wchar ch) { - pg_unicode_normprops key; - pg_unicode_normprops *found = NULL; - - key.codepoint = ch; + const pg_unicode_normprops *found = NULL; switch (form) { case UNICODE_NFC: - found = bsearch(&key, - UnicodeNormProps_NFC_QC, - lengthof(UnicodeNormProps_NFC_QC), - sizeof(pg_unicode_normprops), - qc_compare); + found = qc_hash_lookup(ch, &UnicodeNormInfo_NFC_QC); break; case UNICODE_NFKC: - found = bsearch(&key, - UnicodeNormProps_NFKC_QC, - lengthof(UnicodeNormProps_NFKC_QC), - sizeof(pg_unicode_normprops), - qc_compare); + found = qc_hash_lookup(ch, &UnicodeNormInfo_NFKC_QC); break; default: Assert(false); |