1 files changed, 108 insertions, 0 deletions
diff --git a/src/common/unicode/category_test.c b/src/common/unicode/category_test.c
new file mode 100644
index 00000000000..ba62716d456
--- /dev/null
+++ b/src/common/unicode/category_test.c
@@ -0,0 +1,108 @@
+/*-------------------------------------------------------------------------
+ * category_test.c
+ *		Program to test Unicode general category functions.
+ *
+ * Portions Copyright (c) 2017-2023, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  src/common/unicode/category_test.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres_fe.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#ifdef USE_ICU
+#include <unicode/uchar.h>
+#endif
+#include "common/unicode_category.h"
+#include "common/unicode_version.h"
+
+/*
+ * Parse version into integer for easy comparison.
+ */
+#ifdef USE_ICU
+static int
+parse_unicode_version(const char *version)
+{
+	int			n,
+				major,
+				minor;
+
+	n = sscanf(version, "%d.%d", &major, &minor);
+
+	Assert(n == 2);
+	Assert(minor < 100);
+
+	return major * 100 + minor;
+}
+#endif
+
+/*
+ * Exhaustively test that the Unicode category for each codepoint matches that
+ * returned by ICU.
+ */
+int
+main(int argc, char **argv)
+{
+#ifdef USE_ICU
+	int			pg_unicode_version = parse_unicode_version(PG_UNICODE_VERSION);
+	int			icu_unicode_version = parse_unicode_version(U_UNICODE_VERSION);
+	int			pg_skipped_codepoints = 0;
+	int			icu_skipped_codepoints = 0;
+
+	printf("Postgres Unicode Version:\t%s\n", PG_UNICODE_VERSION);
+	printf("ICU Unicode Version:\t\t%s\n", U_UNICODE_VERSION);
+
+	for (UChar32 code = 0; code <= 0x10ffff; code++)
+	{
+		uint8_t		pg_category = unicode_category(code);
+		uint8_t		icu_category = u_charType(code);
+
+		if (pg_category != icu_category)
+		{
+			/*
+			 * A version mismatch means that some assigned codepoints in the
+			 * newer version may be unassigned in the older version. That's
+			 * OK, though the test will not cover those codepoints marked
+			 * unassigned in the older version (that is, it will no longer be
+			 * an exhaustive test).
+			 */
+			if (pg_category == PG_U_UNASSIGNED &&
+				pg_unicode_version < icu_unicode_version)
+				pg_skipped_codepoints++;
+			else if (icu_category == PG_U_UNASSIGNED &&
+					 icu_unicode_version < pg_unicode_version)
+				icu_skipped_codepoints++;
+			else
+			{
+				printf("FAILURE for codepoint %06x\n", code);
+				printf("Postgres category:	%02d %s %s\n", pg_category,
+					   unicode_category_abbrev(pg_category),
+					   unicode_category_string(pg_category));
+				printf("ICU category:		%02d %s %s\n", icu_category,
+					   unicode_category_abbrev(icu_category),
+					   unicode_category_string(icu_category));
+				printf("\n");
+				exit(1);
+			}
+		}
+	}
+
+	if (pg_skipped_codepoints > 0)
+		printf("Skipped %d codepoints unassigned in Postgres due to Unicode version mismatch.\n",
+			   pg_skipped_codepoints);
+	if (icu_skipped_codepoints > 0)
+		printf("Skipped %d codepoints unassigned in ICU due to Unicode version mismatch.\n",
+			   icu_skipped_codepoints);
+
+	printf("category_test: All tests successful!\n");
+	exit(0);
+#else
+	printf("ICU support required for test; skipping.\n");
+	exit(0);
+#endif
+}