aboutsummaryrefslogtreecommitdiff
path: root/src/common/unicode_case.c
blob: 26722e9a2d956469ae6e21aa41c927d4eeadf55d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
/*-------------------------------------------------------------------------
 * unicode_case.c
 *		Unicode case mapping and case conversion.
 *
 * Portions Copyright (c) 2017-2025, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
 *	  src/common/unicode_case.c
 *
 *-------------------------------------------------------------------------
 */
#ifndef FRONTEND
#include "postgres.h"
#else
#include "postgres_fe.h"
#endif

#include "common/unicode_case.h"
#include "common/unicode_case_table.h"
#include "common/unicode_category.h"
#include "mb/pg_wchar.h"

/*
 * Outcome of mapping a single code point, as reported by casemap().
 */
enum CaseMapResult
{
	/* no mapping found; the caller copies the source character unchanged */
	CASEMAP_SELF,
	/* mapped to exactly one code point, returned through 'simple' */
	CASEMAP_SIMPLE,
	/* mapped to up to MAX_CASE_EXPANSION code points, returned through 'special' */
	CASEMAP_SPECIAL,
};

/*
 * Map for each case kind.
 *
 * Indexed by CaseKind; each entry points at the simple mapping table for that
 * kind. Within a table, element 0 is reserved as 0 (meaning "no mapping"),
 * code points below 0x80 are looked up directly at index (code point + 1),
 * and all other code points are looked up at the index given by
 * case_index().
 */
static const pg_wchar *const casekind_map[NCaseKind] =
{
	[CaseLower] = case_map_lower,
	[CaseTitle] = case_map_title,
	[CaseUpper] = case_map_upper,
	[CaseFold] = case_map_fold,
};

/*
 * Forward declarations of file-local helpers.
 *
 * Parameter names match the corresponding definitions; in particular
 * casemap()'s single-code-point output parameter is named 'simple'.
 */
static pg_wchar find_case_map(pg_wchar ucs, const pg_wchar *map);
static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
						   CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
						   void *wbstate);
static enum CaseMapResult casemap(pg_wchar u1, CaseKind casekind, bool full,
								  const char *src, size_t srclen, size_t srcoff,
								  pg_wchar *simple, const pg_wchar **special);

/*
 * unicode_lowercase_simple()
 *
 * Return the simple lowercase mapping of code. If the lowercase table has no
 * entry for code, return code itself.
 */
pg_wchar
unicode_lowercase_simple(pg_wchar code)
{
	pg_wchar	mapped = find_case_map(code, case_map_lower);

	/* find_case_map() reports a missing entry as 0 */
	if (mapped == 0)
		return code;

	return mapped;
}

/*
 * unicode_titlecase_simple()
 *
 * Return the simple titlecase mapping of code. If the titlecase table has no
 * entry for code, return code itself.
 */
pg_wchar
unicode_titlecase_simple(pg_wchar code)
{
	pg_wchar	mapped = find_case_map(code, case_map_title);

	/* find_case_map() reports a missing entry as 0 */
	if (mapped == 0)
		return code;

	return mapped;
}

/*
 * unicode_uppercase_simple()
 *
 * Return the simple uppercase mapping of code. If the uppercase table has no
 * entry for code, return code itself.
 */
pg_wchar
unicode_uppercase_simple(pg_wchar code)
{
	pg_wchar	mapped = find_case_map(code, case_map_upper);

	/* find_case_map() reports a missing entry as 0 */
	if (mapped == 0)
		return code;

	return mapped;
}

/*
 * unicode_casefold_simple()
 *
 * Return the simple case folding of code. If the case folding table has no
 * entry for code, return code itself.
 */
pg_wchar
unicode_casefold_simple(pg_wchar code)
{
	pg_wchar	mapped = find_case_map(code, case_map_fold);

	/* find_case_map() reports a missing entry as 0 */
	if (mapped == 0)
		return code;

	return mapped;
}

/*
 * unicode_strlower()
 *
 * Lowercase the UTF-8 string src, storing the result in dst.
 *
 * Input: conversion stops after srclen bytes or at the first NUL byte,
 * whichever comes first. A negative srclen means the length is unknown, in
 * which case src must be NUL-terminated.
 *
 * Output: the result is truncated if it is larger than dstsize. dst is
 * NUL-terminated only when dstsize is greater than the result length. With
 * dstsize of zero nothing is written and dst may be NULL, which allows the
 * caller to compute the required buffer size before allocating.
 *
 * When full is true, special case mappings are used where available, provided
 * their conditions are satisfied.
 *
 * Returns the length of the full result, not counting the terminating NUL.
 */
size_t
unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen,
				 bool full)
{
	size_t		result_len;

	/* word boundaries matter only for titlecasing, so pass no callback */
	result_len = convert_case(dst, dstsize, src, srclen, CaseLower, full,
							  NULL, NULL);

	return result_len;
}

/*
 * unicode_strtitle()
 *
 * Titlecase the UTF-8 string src, storing the result in dst.
 *
 * Input: conversion stops after srclen bytes or at the first NUL byte,
 * whichever comes first. A negative srclen means the length is unknown, in
 * which case src must be NUL-terminated.
 *
 * Output: the result is truncated if it is larger than dstsize. dst is
 * NUL-terminated only when dstsize is greater than the result length. With
 * dstsize of zero nothing is written and dst may be NULL, which allows the
 * caller to compute the required buffer size before allocating.
 *
 * When full is true, special case mappings are used where available, provided
 * their conditions are satisfied. When full is false, only simple mappings
 * are used, and characters at word boundaries are uppercased rather than
 * titlecased.
 *
 * Word boundaries are supplied by the wbnext callback. A word boundary is
 * either the offset at which a word starts or the offset of the character
 * immediately after a word. The callback must return, in order: offset 0
 * (the first boundary), the offset of each subsequent boundary, and finally
 * the total length of the string (the last boundary).
 *
 * wbstate is the callback's private state; the caller is responsible for
 * initializing it beforehand and freeing it afterwards.
 *
 * Returns the length of the full result, not counting the terminating NUL.
 */
size_t
unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen,
				 bool full, WordBoundaryNext wbnext, void *wbstate)
{
	size_t		result_len;

	result_len = convert_case(dst, dstsize, src, srclen, CaseTitle, full,
							  wbnext, wbstate);

	return result_len;
}

/*
 * unicode_strupper()
 *
 * Uppercase the UTF-8 string src, storing the result in dst.
 *
 * Input: conversion stops after srclen bytes or at the first NUL byte,
 * whichever comes first. A negative srclen means the length is unknown, in
 * which case src must be NUL-terminated.
 *
 * Output: the result is truncated if it is larger than dstsize. dst is
 * NUL-terminated only when dstsize is greater than the result length. With
 * dstsize of zero nothing is written and dst may be NULL, which allows the
 * caller to compute the required buffer size before allocating.
 *
 * When full is true, special case mappings are used where available, provided
 * their conditions are satisfied.
 *
 * Returns the length of the full result, not counting the terminating NUL.
 */
size_t
unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen,
				 bool full)
{
	size_t		result_len;

	/* word boundaries matter only for titlecasing, so pass no callback */
	result_len = convert_case(dst, dstsize, src, srclen, CaseUpper, full,
							  NULL, NULL);

	return result_len;
}

/*
 * unicode_strfold()
 *
 * Case fold the UTF-8 string src, storing the result in dst.
 *
 * Input: conversion stops after srclen bytes or at the first NUL byte,
 * whichever comes first. A negative srclen means the length is unknown, in
 * which case src must be NUL-terminated.
 *
 * Output: the result is truncated if it is larger than dstsize. dst is
 * NUL-terminated only when dstsize is greater than the result length. With
 * dstsize of zero nothing is written and dst may be NULL, which allows the
 * caller to compute the required buffer size before allocating.
 *
 * When full is true, special case mappings are used where available, provided
 * their conditions are satisfied.
 *
 * Returns the length of the full result, not counting the terminating NUL.
 */
size_t
unicode_strfold(char *dst, size_t dstsize, const char *src, ssize_t srclen,
				bool full)
{
	size_t		result_len;

	/* word boundaries matter only for titlecasing, so pass no callback */
	result_len = convert_case(dst, dstsize, src, srclen, CaseFold, full,
							  NULL, NULL);

	return result_len;
}

/*
 * Implement Unicode Default Case Conversion algorithm.
 *
 * If str_casekind is CaseLower or CaseUpper, map each character in the string
 * for which a mapping is available.
 *
 * If str_casekind is CaseTitle, maps characters found on a word boundary to
 * titlecase (or uppercase if full is false) and other characters to
 * lowercase. NB: does not currently implement the Unicode behavior in which
 * the word boundary is adjusted to the next Cased character. That behavior
 * could be implemented as an option, but it doesn't match the default
 * behavior of ICU, nor does it match the documented behavior of INITCAP().
 *
 * If full is true, use special mappings for relevant characters, which can
 * map a single codepoint to multiple codepoints, or depend on conditions.
 *
 * The source is consumed until srclen bytes have been read (when srclen is
 * non-negative) or a NUL byte is found, whichever comes first.
 *
 * Characters are written to dst only while they fit entirely within dstsize;
 * the running result length keeps growing regardless, so the return value is
 * the length the complete result requires (not counting the terminating
 * NUL). A terminating NUL is stored only if the result length is less than
 * dstsize. Nothing is written through dst when dstsize is zero.
 *
 * wbnext and wbstate must both be supplied for CaseTitle and must both be
 * NULL otherwise.
 */
static size_t
convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
			 CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
			 void *wbstate)
{
	/* character CaseKind varies while titlecasing */
	CaseKind	chr_casekind = str_casekind;
	size_t		srcoff = 0;
	size_t		result_len = 0;
	size_t		boundary = 0;

	Assert((str_casekind == CaseTitle && wbnext && wbstate) ||
		   (str_casekind != CaseTitle && !wbnext && !wbstate));

	if (str_casekind == CaseTitle)
	{
		boundary = wbnext(wbstate);
		Assert(boundary == 0);	/* start of text is always a boundary */
	}

	/* srclen is known to be non-negative wherever it is cast to size_t */
	while ((srclen < 0 || srcoff < (size_t) srclen) && src[srcoff] != '\0')
	{
		pg_wchar	u1 = utf8_to_unicode((unsigned char *) src + srcoff);
		int			u1len = unicode_utf8len(u1);
		pg_wchar	simple = 0;
		const pg_wchar *special = NULL;
		enum CaseMapResult casemap_result;

		if (str_casekind == CaseTitle)
		{
			if (srcoff == boundary)
			{
				/* first character after a boundary; fetch the next boundary */
				chr_casekind = full ? CaseTitle : CaseUpper;
				boundary = wbnext(wbstate);
			}
			else
				chr_casekind = CaseLower;
		}

		casemap_result = casemap(u1, chr_casekind, full, src, srclen, srcoff,
								 &simple, &special);

		switch (casemap_result)
		{
			case CASEMAP_SELF:
				/* no mapping; copy bytes from src */
				Assert(simple == 0);
				Assert(special == NULL);
				if (result_len + u1len <= dstsize)
					memcpy(dst + result_len, src + srcoff, u1len);

				result_len += u1len;
				break;
			case CASEMAP_SIMPLE:
				{
					/* replace with single character */
					pg_wchar	u2 = simple;
					size_t		u2len = unicode_utf8len(u2);

					Assert(special == NULL);
					if (result_len + u2len <= dstsize)
						unicode_to_utf8(u2, (unsigned char *) dst + result_len);

					result_len += u2len;
				}
				break;
			case CASEMAP_SPECIAL:
				/* replace with up to MAX_CASE_EXPANSION characters */
				Assert(simple == 0);
				for (int i = 0; i < MAX_CASE_EXPANSION && special[i]; i++)
				{
					pg_wchar	u2 = special[i];
					size_t		u2len = unicode_utf8len(u2);

					if (result_len + u2len <= dstsize)
						unicode_to_utf8(u2, (unsigned char *) dst + result_len);

					result_len += u2len;
				}
				break;
		}

		/* advance by the encoded length of the source character */
		srcoff += u1len;
	}

	/* terminate only if there is room left after the result */
	if (result_len < dstsize)
		dst[result_len] = '\0';

	return result_len;
}

/*
 * Check that the condition matches Final_Sigma, described in Unicode Table
 * 3-17. The character at the given offset must be directly preceded by a
 * Cased character, and must not be directly followed by a Cased character.
 *
 * Case_Ignorable characters are ignored. NB: some characters may be both
 * Cased and Case_Ignorable, in which case they are ignored.
 *
 * str is the UTF-8 string, len its length in bytes, and offset the byte
 * offset of the character being tested. The forward scan additionally stops
 * at a NUL byte.
 *
 * Returns true if the Final_Sigma condition holds. If only Case_Ignorable
 * characters (or nothing) precede the character, there is no preceding Cased
 * character and the result is false.
 */
static bool
check_final_sigma(const unsigned char *str, size_t len, size_t offset)
{
	bool		preceded_by_cased = false;

	/* the start of the string is not preceded by a Cased character */
	if (offset == 0)
		return false;

	/* iterate backwards, looking for Cased character */
	for (size_t i = offset; i-- > 0;)
	{
		pg_wchar	curr;

		/* continuation byte: keep going back to the start of the character */
		if ((str[i] & 0xC0) == 0x80)
			continue;

		/* str[i] is an ASCII byte or the lead byte of a multibyte sequence */
		curr = utf8_to_unicode(str + i);

		if (pg_u_prop_case_ignorable(curr))
			continue;

		if (!pg_u_prop_cased(curr))
			return false;

		preceded_by_cased = true;
		break;
	}

	/*
	 * Running off the start of the string means only Case_Ignorable
	 * characters were seen, so no Cased character precedes the offset.
	 */
	if (!preceded_by_cased)
		return false;

	/* end of string is not followed by a Cased character */
	if (offset == len)
		return true;

	/*
	 * Iterate forwards, looking for Cased character. Starting at offset + 1
	 * is fine even if the character at offset is multibyte, because its
	 * continuation bytes are skipped.
	 */
	for (size_t i = offset + 1; i < len && str[i] != '\0'; i++)
	{
		pg_wchar	curr;

		if ((str[i] & 0xC0) == 0x80)
			continue;

		curr = utf8_to_unicode(str + i);

		if (pg_u_prop_case_ignorable(curr))
			continue;

		if (pg_u_prop_cased(curr))
			return false;

		/* neither ignorable nor Cased: nothing Cased directly follows */
		break;
	}

	return true;
}

/*
 * Decide whether a special case mapping may be applied.
 *
 * Unicode permits a special mapping to be restricted to certain contexts.
 * A conditions value of zero means the mapping is unconditional. The only
 * condition currently supported is Final_Sigma, which is evaluated for the
 * character at byte offset 'offset' in the string 'str' of length 'len'.
 *
 * Returns true if the mapping applies. Any other conditions value trips an
 * assertion and is treated as not satisfied.
 */
static bool
check_special_conditions(int conditions, const char *str, size_t len,
						 size_t offset)
{
	/* unconditional mapping */
	if (conditions == 0)
		return true;

	if (conditions != PG_U_FINAL_SIGMA)
	{
		/* no other conditions supported */
		Assert(false);
		return false;
	}

	return check_final_sigma((unsigned char *) str, len, offset);
}

/*
 * Map code point u1 to the case given by casekind.
 *
 * There are three possible outcomes:
 *
 * CASEMAP_SPECIAL: full is true, u1 has a special case mapping, and that
 * mapping's conditions hold at byte offset srcoff of src. '*special' is set
 * to the mapping, an array of up to MAX_CASE_EXPANSION characters.
 *
 * CASEMAP_SIMPLE: a simple mapping was used, and '*simple' is set to the
 * resulting code point. Code points below 0x80 always take this path, with
 * the value read straight from the table.
 *
 * CASEMAP_SELF: u1 has no mapping; the caller should copy the character
 * without modification. Neither output parameter is written.
 */
static enum CaseMapResult
casemap(pg_wchar u1, CaseKind casekind, bool full,
		const char *src, size_t srclen, size_t srcoff,
		pg_wchar *simple, const pg_wchar **special)
{
	const pg_wchar *simple_map = casekind_map[casekind];
	uint16		idx;

	/*
	 * Fast path for codepoints < 0x80. The first elements in all tables are
	 * reserved as 0 (as NULL), so the data starts at index 1, not 0.
	 */
	if (u1 < 0x80)
	{
		*simple = simple_map[u1 + 1];
		return CASEMAP_SIMPLE;
	}

	idx = case_index(u1);
	if (idx == 0)
		return CASEMAP_SELF;

	/* a usable special mapping takes precedence over the simple one */
	if (full && case_map_special[idx] != 0)
	{
		int			conditions = special_case[case_map_special[idx]].conditions;

		if (check_special_conditions(conditions, src, srclen, srcoff))
		{
			*special = special_case[case_map_special[idx]].map[casekind];
			return CASEMAP_SPECIAL;
		}
	}

	*simple = simple_map[idx];
	return CASEMAP_SIMPLE;
}

/*
 * Look up ucs in the simple case map 'map'.
 *
 * Returns the mapped code point, or 0 if the map has no entry for ucs.
 */
static pg_wchar
find_case_map(pg_wchar ucs, const pg_wchar *map)
{
	size_t		idx;

	/*
	 * Codepoints < 0x80 take a fast path: they are stored directly, shifted
	 * by one because the first elements in all tables are reserved as 0 (as
	 * NULL).
	 */
	if (ucs < 0x80)
		idx = (size_t) ucs + 1;
	else
		idx = case_index(ucs);

	return map[idx];
}