src/common/unicode/generate-unicode_case_table.pl


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134

#!/usr/bin/perl
#
# Generate Unicode character case mappings. Does not include tailoring
# or locale-specific mappings.
#
# Input: UnicodeData.txt
# Output: unicode_case_table.h
#
# Copyright (c) 2000-2024, PostgreSQL Global Development Group

use strict;
use warnings FATAL => 'all';
use Getopt::Long;

use FindBin;
use lib "$FindBin::RealBin/../../tools/";

# Directory that holds the input UnicodeData.txt and receives the generated
# header; defaults to the current directory.
my $output_path = '.';

# Reject unknown or malformed options instead of silently carrying on with
# the defaults (GetOptions has already reported the details on STDERR).
GetOptions('outdir:s' => \$output_path)
  or die "$0: wrong arguments\n";

# 'outdir:s' takes an optional value, so a bare "--outdir" stores an empty
# string; fall back to the default rather than resolving paths against "/".
$output_path = '.' if $output_path eq '';

my $output_table_file = "$output_path/unicode_case_table.h";

my $FH;

# Case mappings keyed by numeric codepoint.  Only codepoints that have at
# least one simple case mapping get an entry; each entry holds the
# Simple_Lowercase, Simple_Titlecase and Simple_Uppercase targets, with a
# missing mapping replaced by the codepoint itself.
my %simple = ();

open($FH, '<', "$output_path/UnicodeData.txt")
  or die "Could not open $output_path/UnicodeData.txt: $!.";
while (my $line = <$FH>)
{
	# Strip the line terminator and keep trailing empty fields (limit -1),
	# so the last field is present even when it is empty and the final line
	# of the file lacks a newline.
	chomp $line;
	my @elts = split(';', $line, -1);

	# The simple case mapping fields are read from indexes 12-14, so a
	# shorter record cannot be processed; fail with a clear message rather
	# than an "uninitialized value" fatal warning.
	if (@elts < 15)
	{
		die sprintf(
			"UnicodeData.txt line %d: expected at least 15 fields, found %d\n",
			$., scalar(@elts));
	}

	my $code = hex($elts[0]);

	# Surrounding whitespace (including a stray carriage return) is trimmed
	# before conversion.  An empty field converts to 0, which is treated
	# below as "no mapping".
	my $simple_uppercase = hex($elts[12] =~ s/^\s+|\s+$//rg);
	my $simple_lowercase = hex($elts[13] =~ s/^\s+|\s+$//rg);
	my $simple_titlecase = hex($elts[14] =~ s/^\s+|\s+$//rg);

	die "codepoint $code out of range" if $code > 0x10FFFF;
	die "Simple_Lowercase $code out of range" if $simple_lowercase > 0x10FFFF;
	die "Simple_Titlecase $code out of range" if $simple_titlecase > 0x10FFFF;
	die "Simple_Uppercase $code out of range" if $simple_uppercase > 0x10FFFF;

	# Record only codepoints with at least one mapping; any mapping that is
	# absent defaults to the codepoint itself.
	if ($simple_lowercase || $simple_titlecase || $simple_uppercase)
	{
		$simple{$code} = {
			Simple_Lowercase => ($simple_lowercase || $code),
			Simple_Titlecase => ($simple_titlecase || $code),
			Simple_Uppercase => ($simple_uppercase || $code)
		};
	}
}
close $FH;

# Open the generated header for writing.
open my $OT, '>', $output_table_file
  or die "Could not open output file $output_table_file: $!\n";

# Work out the array size: one slot for every codepoint below 0x80 (the
# dense part), plus one slot for each codepoint at or above 0x80 that has a
# case mapping (the sparse part).
my $num_sparse = grep { $_ >= 0x80 } keys %simple;
my $num_simple = 0x80 + $num_sparse;

# Emit the fixed part of the header: the file banner, the includes, the
# CaseKind enum and pg_case_map struct definitions, and the opening of the
# case_map array.  The heredoc is interpolating, so $num_simple is
# substituted as the declared array size; everything else is written
# verbatim.
print $OT <<"EOS";
/*-------------------------------------------------------------------------
 *
 * unicode_case_table.h
 *	  Case mapping and information table.
 *
 * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/include/common/unicode_case_table.h
 *
 *-------------------------------------------------------------------------
 */

/*
 * File auto-generated by src/common/unicode/generate-unicode_case_table.pl,
 * do not edit. There is deliberately not an #ifndef PG_UNICODE_CASE_TABLE_H
 * here.
 */

#include "common/unicode_case.h"
#include "mb/pg_wchar.h"

typedef enum
{
	CaseLower = 0,
	CaseTitle = 1,
	CaseUpper = 2,
	NCaseKind
}			CaseKind;

typedef struct
{
	pg_wchar	codepoint;		/* Unicode codepoint */
	pg_wchar	simplemap[NCaseKind];
}			pg_case_map;

/*
 * Case mapping table. Dense for codepoints < 0x80 (enabling fast lookup),
 * sparse for higher codepoints (requiring scan or binary search).
 */
static const pg_case_map case_map[$num_simple] =
{
EOS

# Dense part: one entry for every codepoint below 0x80, emitted in order so
# that an entry's position in the array equals its codepoint.  Codepoints
# without a case mapping map to themselves.
print $OT "\t/* begin dense entries for codepoints < 0x80 */\n";
for my $code (0 .. 0x7F)
{
	# Plain rvalue lookup: unlike $simple{$code}{...}, this does not
	# autovivify an empty entry in %simple for unmapped codepoints.
	my $map = $simple{$code} // {};
	my $lc = ($map->{Simple_Lowercase} || $code);
	my $tc = ($map->{Simple_Titlecase} || $code);
	my $uc = ($map->{Simple_Uppercase} || $code);
	printf $OT
	  "\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x}},\n",
	  $code, $lc, $tc, $uc;
}
print $OT "\n";

# Sparse part: one entry for each codepoint at or above 0x80 that has a case
# mapping, in ascending codepoint order.  Codepoints below 0x80 are skipped
# here because the dense part already covers them.
print $OT "\t/* begin sparse entries for codepoints >= 0x80 */\n";
foreach my $code (sort { $a <=> $b } grep { $_ >= 0x80 } keys %simple)
{
	my $map = $simple{$code};
	printf $OT
	  "\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x}},\n",
	  $code, $map->{Simple_Lowercase}, $map->{Simple_Titlecase},
	  $map->{Simple_Uppercase};
}
print $OT "};\n";

# Buffered write errors (for example a full disk) may only surface when the
# handle is flushed, so check close() rather than risk leaving a truncated
# header behind with a successful exit status.
close $OT
  or die "Could not close output file $output_table_file: $!\n";