Add support for other normal forms to Unicode normalization API

It previously only supported NFKC, for use by SASLprep. This expands the API to offer the choice of all four normalization forms. Right now, there are no internal users of the forms other than NFKC. Reviewed-by: Daniel Verite <daniel@manitou-mail.org> Reviewed-by: Andreas Karlsson <andreas@proxel.se> Discussion: https://www.postgresql.org/message-id/flat/c1909f27-c269-2ed9-12f8-3ab72c8caf7a@2ndquadrant.com
author: Peter Eisentraut <peter@eisentraut.org> 2020-03-24 08:49:52 +0100
committer: Peter Eisentraut <peter@eisentraut.org> 2020-03-24 10:02:46 +0100
commit: d40d564c5a920e1121b28463dfed74441cbae5c0 (patch)
tree: 7a3cc5d8353896fb5abafeee9eaf63952beb2563 /src/common/unicode_norm.c
parent: cedffbdb8b137325a79e07a976457bc2314adf9b (diff)
download: postgresql-d40d564c5a920e1121b28463dfed74441cbae5c0.tar.gz
postgresql-d40d564c5a920e1121b28463dfed74441cbae5c0.zip
1 files changed, 19 insertions, 12 deletions
diff --git a/src/common/unicode_norm.c b/src/common/unicode_norm.c
index ab56490a075..ec5abea6bdd 100644
--- a/src/common/unicode_norm.c
+++ b/src/common/unicode_norm.c
@@ -1,6 +1,6 @@
 /*-------------------------------------------------------------------------
  * unicode_norm.c
- *		Normalize a Unicode string to NFKC form
+ *		Normalize a Unicode string
  *
  * This implements Unicode normalization, per the documentation at
  * https://www.unicode.org/reports/tr15/.
@@ -98,7 +98,7 @@ get_code_decomposition(pg_unicode_decomposition *entry, int *dec_size)
  * are, in turn, decomposable.
  */
 static int
-get_decomposed_size(pg_wchar code)
+get_decomposed_size(pg_wchar code, bool compat)
 {
 	pg_unicode_decomposition *entry;
 	int			size = 0;
@@ -131,7 +131,8 @@ get_decomposed_size(pg_wchar code)
 	 * Just count current code if no other decompositions.  A NULL entry is
 	 * equivalent to a character with class 0 and no decompositions.
 	 */
-	if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0)
+	if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 ||
+		(!compat && DECOMPOSITION_IS_COMPAT(entry)))
 		return 1;
 
 	/*
@@ -143,7 +144,7 @@ get_decomposed_size(pg_wchar code)
 	{
 		uint32		lcode = decomp[i];
 
-		size += get_decomposed_size(lcode);
+		size += get_decomposed_size(lcode, compat);
 	}
 
 	return size;
@@ -224,7 +225,7 @@ recompose_code(uint32 start, uint32 code, uint32 *result)
  * in the array result.
  */
 static void
-decompose_code(pg_wchar code, pg_wchar **result, int *current)
+decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
 {
 	pg_unicode_decomposition *entry;
 	int			i;
@@ -272,7 +273,8 @@ decompose_code(pg_wchar code, pg_wchar **result, int *current)
 	 * character with class 0 and no decompositions, so just leave also in
 	 * this case.
 	 */
-	if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0)
+	if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 ||
+		(!compat && DECOMPOSITION_IS_COMPAT(entry)))
 	{
 		pg_wchar   *res = *result;
 
@@ -290,12 +292,12 @@ decompose_code(pg_wchar code, pg_wchar **result, int *current)
 		pg_wchar	lcode = (pg_wchar) decomp[i];
 
 		/* Leave if no more decompositions */
-		decompose_code(lcode, result, current);
+		decompose_code(lcode, compat, result, current);
 	}
 }
 
 /*
- * unicode_normalize_kc - Normalize a Unicode string to NFKC form.
+ * unicode_normalize - Normalize a Unicode string to the specified form.
  *
  * The input is a 0-terminated array of codepoints.
  *
@@ -304,8 +306,10 @@ decompose_code(pg_wchar code, pg_wchar **result, int *current)
  * string is palloc'd instead, and OOM is reported with ereport().
  */
 pg_wchar *
-unicode_normalize_kc(const pg_wchar *input)
+unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
 {
+	bool		compat = (form == UNICODE_NFKC || form == UNICODE_NFKD);
+	bool		recompose = (form == UNICODE_NFC || form == UNICODE_NFKC);
 	pg_wchar   *decomp_chars;
 	pg_wchar   *recomp_chars;
 	int			decomp_size,
@@ -326,7 +330,7 @@ unicode_normalize_kc(const pg_wchar *input)
 	 */
 	decomp_size = 0;
 	for (p = input; *p; p++)
-		decomp_size += get_decomposed_size(*p);
+		decomp_size += get_decomposed_size(*p, compat);
 
 	decomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
 	if (decomp_chars == NULL)
@@ -338,7 +342,7 @@ unicode_normalize_kc(const pg_wchar *input)
 	 */
 	current_size = 0;
 	for (p = input; *p; p++)
-		decompose_code(*p, &decomp_chars, &current_size);
+		decompose_code(*p, compat, &decomp_chars, &current_size);
 	decomp_chars[decomp_size] = '\0';
 	Assert(decomp_size == current_size);
 
@@ -385,8 +389,11 @@ unicode_normalize_kc(const pg_wchar *input)
 			count -= 2;
 	}
 
+	if (!recompose)
+		return decomp_chars;
+
 	/*
-	 * The last phase of NFKC is the recomposition of the reordered Unicode
+	 * The last phase of NFC and NFKC is the recomposition of the reordered Unicode
 	 * string using combining classes. The recomposed string cannot be longer
 	 * than the decomposed one, so make the allocation of the output string
 	 * based on that assumption.
author	Peter Eisentraut <peter@eisentraut.org>	2020-03-24 08:49:52 +0100
committer	Peter Eisentraut <peter@eisentraut.org>	2020-03-24 10:02:46 +0100
commit	d40d564c5a920e1121b28463dfed74441cbae5c0 (patch)
tree	7a3cc5d8353896fb5abafeee9eaf63952beb2563 /src/common/unicode_norm.c
parent	cedffbdb8b137325a79e07a976457bc2314adf9b (diff)
download	postgresql-d40d564c5a920e1121b28463dfed74441cbae5c0.tar.gz postgresql-d40d564c5a920e1121b28463dfed74441cbae5c0.zip