Use perfect hash for NFC and NFKC Unicode Normalization quick check

This makes the normalization quick check about 30% faster for NFC and 50% faster for NFKC than the binary search used previously. The hash lookup reuses the existing array of bit fields used for the binary search to get the quick check property and is generated as part of "make update-unicode" in src/common/unicode/. Author: John Naylor Reviewed-by: Mark Dilger, Michael Paquier Discussion: https://postgr.es/m/CACPNZCt4fbJ0_bGrN5QPt34N4whv=mszM0LMVQdoa2rC9UMRXA@mail.gmail.com
author: Michael Paquier <michael@paquier.xyz> 2020-10-11 19:09:01 +0900
committer: Michael Paquier <michael@paquier.xyz> 2020-10-11 19:09:01 +0900
commit: 80f8eb79e24d9b7963eaf17ce846667e2c6b6e6f (patch)
tree: c5ac3c2938b9fd81f8319a44429817454375ce1d /src/common/unicode_norm.c
parent: 85d08b8b721fb3b9359bca9325bc425cc95c30b1 (diff)
download: postgresql-80f8eb79e24d9b7963eaf17ce846667e2c6b6e6f.tar.gz
postgresql-80f8eb79e24d9b7963eaf17ce846667e2c6b6e6f.zip
1 files changed, 27 insertions, 21 deletions
diff --git a/src/common/unicode_norm.c b/src/common/unicode_norm.c
index ab5ce593456..626645ac870 100644
--- a/src/common/unicode_norm.c
+++ b/src/common/unicode_norm.c
@@ -465,15 +465,32 @@ get_canonical_class(pg_wchar ch)
 		return entry->comb_class;
 }
 
-static int
-qc_compare(const void *p1, const void *p2)
+static const pg_unicode_normprops *
+qc_hash_lookup(pg_wchar ch, const pg_unicode_norminfo *norminfo)
 {
-	uint32		v1,
-				v2;
+	int			h;
+	uint32		hashkey;
 
-	v1 = ((const pg_unicode_normprops *) p1)->codepoint;
-	v2 = ((const pg_unicode_normprops *) p2)->codepoint;
-	return (v1 - v2);
+	/*
+	 * Compute the hash function. The hash key is the codepoint with the bytes
+	 * in network order.
+	 */
+	hashkey = htonl(ch);
+	h = norminfo->hash(&hashkey);
+
+	/* An out-of-range result implies no match */
+	if (h < 0 || h >= norminfo->num_normprops)
+		return NULL;
+
+	/*
+	 * Since it's a perfect hash, we need only match to the specific codepoint
+	 * it identifies.
+	 */
+	if (ch != norminfo->normprops[h].codepoint)
+		return NULL;
+
+	/* Success! */
+	return &norminfo->normprops[h];
 }
 
 /*
@@ -482,26 +499,15 @@ qc_compare(const void *p1, const void *p2)
 static UnicodeNormalizationQC
 qc_is_allowed(UnicodeNormalizationForm form, pg_wchar ch)
 {
-	pg_unicode_normprops key;
-	pg_unicode_normprops *found = NULL;
-
-	key.codepoint = ch;
+	const pg_unicode_normprops *found = NULL;
 
 	switch (form)
 	{
 		case UNICODE_NFC:
-			found = bsearch(&key,
-							UnicodeNormProps_NFC_QC,
-							lengthof(UnicodeNormProps_NFC_QC),
-							sizeof(pg_unicode_normprops),
-							qc_compare);
+			found = qc_hash_lookup(ch, &UnicodeNormInfo_NFC_QC);
 			break;
 		case UNICODE_NFKC:
-			found = bsearch(&key,
-							UnicodeNormProps_NFKC_QC,
-							lengthof(UnicodeNormProps_NFKC_QC),
-							sizeof(pg_unicode_normprops),
-							qc_compare);
+			found = qc_hash_lookup(ch, &UnicodeNormInfo_NFKC_QC);
 			break;
 		default:
 			Assert(false);
author	Michael Paquier <michael@paquier.xyz>	2020-10-11 19:09:01 +0900
committer	Michael Paquier <michael@paquier.xyz>	2020-10-11 19:09:01 +0900
commit	80f8eb79e24d9b7963eaf17ce846667e2c6b6e6f (patch)
tree	c5ac3c2938b9fd81f8319a44429817454375ce1d /src/common/unicode_norm.c
parent	85d08b8b721fb3b9359bca9325bc425cc95c30b1 (diff)
download	postgresql-80f8eb79e24d9b7963eaf17ce846667e2c6b6e6f.tar.gz postgresql-80f8eb79e24d9b7963eaf17ce846667e2c6b6e6f.zip