src/common/unicode/category_test.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236

/*-------------------------------------------------------------------------
 * category_test.c
 *		Program to test Unicode general category and character properties.
 *
 * Portions Copyright (c) 2017-2025, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
 *	  src/common/unicode/category_test.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres_fe.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <wctype.h>

#ifdef USE_ICU
#include <unicode/uchar.h>
#endif

#include "common/unicode_category.h"
#include "common/unicode_version.h"

static int	pg_unicode_version = 0;
#ifdef USE_ICU
static int	icu_unicode_version = 0;
#endif

/*
 * Parse version into integer for easy comparison.
 */
static int
parse_unicode_version(const char *version)
{
	int			n PG_USED_FOR_ASSERTS_ONLY;
	int			major;
	int			minor;

	n = sscanf(version, "%d.%d", &major, &minor);

	Assert(n == 2);
	Assert(minor < 100);

	return major * 100 + minor;
}

#ifdef USE_ICU
/*
 * Test Postgres Unicode tables by comparing with ICU. Test the General
 * Category, as well as the properties Alphabetic, Lowercase, Uppercase,
 * White_Space, and Hex_Digit.
 */
static void
icu_test()
{
	int			successful = 0;
	int			pg_skipped_codepoints = 0;
	int			icu_skipped_codepoints = 0;

	for (pg_wchar code = 0; code <= 0x10ffff; code++)
	{
		uint8_t		pg_category = unicode_category(code);
		uint8_t		icu_category = u_charType(code);

		/* Property tests */
		bool		prop_alphabetic = pg_u_prop_alphabetic(code);
		bool		prop_lowercase = pg_u_prop_lowercase(code);
		bool		prop_uppercase = pg_u_prop_uppercase(code);
		bool		prop_cased = pg_u_prop_cased(code);
		bool		prop_case_ignorable = pg_u_prop_case_ignorable(code);
		bool		prop_white_space = pg_u_prop_white_space(code);
		bool		prop_hex_digit = pg_u_prop_hex_digit(code);
		bool		prop_join_control = pg_u_prop_join_control(code);

		bool		icu_prop_alphabetic = u_hasBinaryProperty(code, UCHAR_ALPHABETIC);
		bool		icu_prop_lowercase = u_hasBinaryProperty(code, UCHAR_LOWERCASE);
		bool		icu_prop_uppercase = u_hasBinaryProperty(code, UCHAR_UPPERCASE);
		bool		icu_prop_cased = u_hasBinaryProperty(code, UCHAR_CASED);
		bool		icu_prop_case_ignorable = u_hasBinaryProperty(code, UCHAR_CASE_IGNORABLE);
		bool		icu_prop_white_space = u_hasBinaryProperty(code, UCHAR_WHITE_SPACE);
		bool		icu_prop_hex_digit = u_hasBinaryProperty(code, UCHAR_HEX_DIGIT);
		bool		icu_prop_join_control = u_hasBinaryProperty(code, UCHAR_JOIN_CONTROL);

		/*
		 * Compare with ICU for character classes using:
		 *
		 * https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/uchar_8h.html#details
		 *
		 * which describes how to use ICU to test for membership in regex
		 * character classes.
		 *
		 * NB: the document suggests testing for some properties such as
		 * UCHAR_POSIX_ALNUM, but that doesn't mean that we're testing for the
		 * "POSIX Compatible" character classes.
		 */
		bool		isalpha = pg_u_isalpha(code);
		bool		islower = pg_u_islower(code);
		bool		isupper = pg_u_isupper(code);
		bool		ispunct = pg_u_ispunct(code, false);
		bool		isdigit = pg_u_isdigit(code, false);
		bool		isxdigit = pg_u_isxdigit(code, false);
		bool		isalnum = pg_u_isalnum(code, false);
		bool		isspace = pg_u_isspace(code);
		bool		isblank = pg_u_isblank(code);
		bool		iscntrl = pg_u_iscntrl(code);
		bool		isgraph = pg_u_isgraph(code);
		bool		isprint = pg_u_isprint(code);

		bool		icu_isalpha = u_isUAlphabetic(code);
		bool		icu_islower = u_isULowercase(code);
		bool		icu_isupper = u_isUUppercase(code);
		bool		icu_ispunct = u_ispunct(code);
		bool		icu_isdigit = u_isdigit(code);
		bool		icu_isxdigit = u_hasBinaryProperty(code,
													   UCHAR_POSIX_XDIGIT);
		bool		icu_isalnum = u_hasBinaryProperty(code,
													  UCHAR_POSIX_ALNUM);
		bool		icu_isspace = u_isUWhiteSpace(code);
		bool		icu_isblank = u_isblank(code);
		bool		icu_iscntrl = icu_category == PG_U_CONTROL;
		bool		icu_isgraph = u_hasBinaryProperty(code,
													  UCHAR_POSIX_GRAPH);
		bool		icu_isprint = u_hasBinaryProperty(code,
													  UCHAR_POSIX_PRINT);

		/*
		 * A version mismatch means that some assigned codepoints in the newer
		 * version may be unassigned in the older version. That's OK, though
		 * the test will not cover those codepoints marked unassigned in the
		 * older version (that is, it will no longer be an exhaustive test).
		 */
		if (pg_category == PG_U_UNASSIGNED &&
			icu_category != PG_U_UNASSIGNED &&
			pg_unicode_version < icu_unicode_version)
		{
			pg_skipped_codepoints++;
			continue;
		}

		if (icu_category == PG_U_UNASSIGNED &&
			pg_category != PG_U_UNASSIGNED &&
			icu_unicode_version < pg_unicode_version)
		{
			icu_skipped_codepoints++;
			continue;
		}

		if (pg_category != icu_category)
		{
			printf("category_test: FAILURE for codepoint 0x%06x\n", code);
			printf("category_test: Postgres category:	%02d %s %s\n", pg_category,
				   unicode_category_abbrev(pg_category),
				   unicode_category_string(pg_category));
			printf("category_test: ICU category:		%02d %s %s\n", icu_category,
				   unicode_category_abbrev(icu_category),
				   unicode_category_string(icu_category));
			printf("\n");
			exit(1);
		}

		if (prop_alphabetic != icu_prop_alphabetic ||
			prop_lowercase != icu_prop_lowercase ||
			prop_uppercase != icu_prop_uppercase ||
			prop_cased != icu_prop_cased ||
			prop_case_ignorable != icu_prop_case_ignorable ||
			prop_white_space != icu_prop_white_space ||
			prop_hex_digit != icu_prop_hex_digit ||
			prop_join_control != icu_prop_join_control)
		{
			printf("category_test: FAILURE for codepoint 0x%06x\n", code);
			printf("category_test: Postgres	property	alphabetic/lowercase/uppercase/cased/case_ignorable/white_space/hex_digit/join_control: %d/%d/%d/%d/%d/%d/%d/%d\n",
				   prop_alphabetic, prop_lowercase, prop_uppercase,
				   prop_cased, prop_case_ignorable,
				   prop_white_space, prop_hex_digit, prop_join_control);
			printf("category_test: ICU	property	alphabetic/lowercase/uppercase/cased/case_ignorable/white_space/hex_digit/join_control: %d/%d/%d/%d/%d/%d/%d/%d\n",
				   icu_prop_alphabetic, icu_prop_lowercase, icu_prop_uppercase,
				   icu_prop_cased, icu_prop_case_ignorable,
				   icu_prop_white_space, icu_prop_hex_digit, icu_prop_join_control);
			printf("\n");
			exit(1);
		}

		if (isalpha != icu_isalpha ||
			islower != icu_islower ||
			isupper != icu_isupper ||
			ispunct != icu_ispunct ||
			isdigit != icu_isdigit ||
			isxdigit != icu_isxdigit ||
			isalnum != icu_isalnum ||
			isspace != icu_isspace ||
			isblank != icu_isblank ||
			iscntrl != icu_iscntrl ||
			isgraph != icu_isgraph ||
			isprint != icu_isprint)
		{
			printf("category_test: FAILURE for codepoint 0x%06x\n", code);
			printf("category_test: Postgres	class	alpha/lower/upper/punct/digit/xdigit/alnum/space/blank/cntrl/graph/print: %d/%d/%d/%d/%d/%d/%d/%d/%d/%d/%d/%d\n",
				   isalpha, islower, isupper, ispunct, isdigit, isxdigit, isalnum, isspace, isblank, iscntrl, isgraph, isprint);
			printf("category_test: ICU class	alpha/lower/upper/punct/digit/xdigit/alnum/space/blank/cntrl/graph/print: %d/%d/%d/%d/%d/%d/%d/%d/%d/%d/%d/%d\n",
				   icu_isalpha, icu_islower, icu_isupper, icu_ispunct, icu_isdigit, icu_isxdigit, icu_isalnum, icu_isspace, icu_isblank, icu_iscntrl, icu_isgraph, icu_isprint);
			printf("\n");
			exit(1);
		}

		if (pg_category != PG_U_UNASSIGNED)
			successful++;
	}

	if (pg_skipped_codepoints > 0)
		printf("category_test: skipped %d codepoints unassigned in Postgres due to Unicode version mismatch\n",
			   pg_skipped_codepoints);
	if (icu_skipped_codepoints > 0)
		printf("category_test: skipped %d codepoints unassigned in ICU due to Unicode version mismatch\n",
			   icu_skipped_codepoints);

	printf("category_test: ICU test: %d codepoints successful\n", successful);
}
#endif

int
main(int argc, char **argv)
{
	pg_unicode_version = parse_unicode_version(PG_UNICODE_VERSION);
	printf("category_test: Postgres Unicode version:\t%s\n", PG_UNICODE_VERSION);

#ifdef USE_ICU
	icu_unicode_version = parse_unicode_version(U_UNICODE_VERSION);
	printf("category_test: ICU Unicode version:\t\t%s\n", U_UNICODE_VERSION);

	icu_test();
#else
	printf("category_test: ICU not available; skipping\n");
#endif
}