aboutsummaryrefslogtreecommitdiff
path: root/src/common/unicode/generate-unicode_category_table.pl
blob: a50c87b7e96752ac116078d3e95fc8938a9d4da6 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
#!/usr/bin/perl
#
# Generate a code point category table and its lookup utilities, using
# Unicode data files as input.
#
# Input: UnicodeData.txt
# Output: unicode_category_table.h
#
# Copyright (c) 2000-2024, PostgreSQL Global Development Group

use strict;
use warnings FATAL => 'all';
use Getopt::Long;

use FindBin;
use lib "$FindBin::RealBin/../../tools/";

# General category used for code points that are not listed in
# UnicodeData.txt. The parsing code below uses it as the marker for gaps
# between listed entries.
my $CATEGORY_UNASSIGNED = 'Cn';

# Directory that holds UnicodeData.txt and receives the generated header.
# Defaults to the current directory; override with --outdir.
my $output_path = '.';

# GetOptions() reports unknown or malformed options on stderr and returns
# false. Stop here instead of silently carrying on with the default path.
GetOptions('outdir:s' => \$output_path)
  or die "Usage: $0 [--outdir DIR]\n";

my $output_table_file = "$output_path/unicode_category_table.h";

# Input file handle; opened on UnicodeData.txt further down.
my $FH;

# Read entries from UnicodeData.txt into a list of codepoint ranges
# and their general category. Each element pushed onto @category_ranges
# is a hash with the keys "start" and "end" (inclusive codepoints) and
# "category" (the two-letter general category). Ranges of unassigned
# codepoints are never stored.
my @category_ranges = ();

# State of the range currently being accumulated. All three stay undef
# until the first entry has been read.
my $range_start = undef;
my $range_end = undef;
my $range_category = undef;

# If between a "<..., First>" entry and a "<..., Last>" entry, the gap in
# codepoints represents a range, and $gap_category is equal to the
# category for both (which must match). Otherwise, the gap represents
# unassigned code points.
my $gap_category = undef;

# Append the range accumulated so far to @category_ranges, unless it
# covers unassigned codepoints, which are left out of the table.
my $emit_range = sub {
	return if $range_category eq $CATEGORY_UNASSIGNED;
	push(@category_ranges, {start => $range_start, end => $range_end,
							category => $range_category});
	return;
};

my $input_file = "$output_path/UnicodeData.txt";

open($FH, '<', $input_file)
  or die "Could not open $input_file: $!.";
while (my $line = <$FH>)
{
	my @elts = split(';', $line);

	# Blank or truncated lines would otherwise only surface as fatal
	# "uninitialized value" warnings further down; reject them here with
	# a clear message. The message has no trailing newline, so die
	# appends the input line number.
	die "malformed entry in $input_file"
	  if scalar(@elts) < 3 || $elts[0] !~ /\A[0-9A-Fa-f]+\z/;

	my $code = hex($elts[0]);
	my $name = $elts[1];
	my $category = $elts[2];

	die "codepoint out of range" if $code > 0x10FFFF;
	die "unassigned codepoint in UnicodeData.txt" if $category eq $CATEGORY_UNASSIGNED;

	if (!defined($range_start)) {
		my $code_str = sprintf "0x%06x", $code;
		die "inconsistent state before first entry"
		  if defined($range_end) || defined($range_category) || defined($gap_category);
		die "unexpected first entry <..., Last>" if ($name =~ /Last>/);
		die "expected 0x000000 for first entry, got $code_str" if $code != 0x000000;

		# initialize
		$range_start = $code;
		$range_end = $code;
		$range_category = $category;
		if ($name =~ /<.*, First>$/) {
			$gap_category = $category;
		} else {
			$gap_category = $CATEGORY_UNASSIGNED;
		}
		next;
	}

	# The gap detection below compares against the previous codepoint, so
	# it only works if entries arrive in strictly ascending order.
	die "codepoints not in ascending order" if $code <= $range_end;

	# Gap in codepoints detected. If it's a different category than
	# the current range, emit the current range and initialize a new
	# range representing the gap.
	if ($range_end + 1 != $code && $range_category ne $gap_category) {
		$emit_range->();
		$range_start = $range_end + 1;
		$range_end = $code - 1;
		$range_category = $gap_category;
	}

	# different category; new range
	if ($range_category ne $category) {
		$emit_range->();
		$range_start = $code;
		$range_end = $code;
		$range_category = $category;
	}

	# Track whether we are inside a <..., First> / <..., Last> pair, and
	# check that the pairs are well formed.
	if ($name =~ /<.*, First>$/) {
		die "<..., First> entry unexpectedly follows another <..., First> entry"
		  if $gap_category ne $CATEGORY_UNASSIGNED;
		$gap_category = $category;
	}
	elsif ($name =~ /<.*, Last>$/) {
		die "<..., First> and <..., Last> entries have mismatching general category"
		  if $gap_category ne $category;
		$gap_category = $CATEGORY_UNASSIGNED;
	}
	else {
		die "unexpected entry found between <..., First> and <..., Last>"
		  if $gap_category ne $CATEGORY_UNASSIGNED;
	}

	# The current entry always extends the range being accumulated.
	$range_end = $code;
}
close $FH;

# An empty input file leaves all state variables undefined; report that
# directly instead of tripping over an undefined $gap_category below.
die "no entries found in $input_file" if !defined($range_start);

die "<..., First> entry with no corresponding <..., Last> entry"
  if $gap_category ne $CATEGORY_UNASSIGNED;

# emit final range
$emit_range->();

# Number of collected ranges; interpolated into the generated array
# declaration as its size.
my $num_ranges = @category_ranges;

# Maps each two-letter general category code to the name of the PG_U_*
# constant that is written into the generated table.
# See: https://www.unicode.org/reports/tr44/#General_Category_Values
my $categories = {
	# L*: letters
	Lu => 'PG_U_UPPERCASE_LETTER',
	Ll => 'PG_U_LOWERCASE_LETTER',
	Lt => 'PG_U_TITLECASE_LETTER',
	Lm => 'PG_U_MODIFIER_LETTER',
	Lo => 'PG_U_OTHER_LETTER',

	# M*: marks
	Mn => 'PG_U_NONSPACING_MARK',
	Me => 'PG_U_ENCLOSING_MARK',
	Mc => 'PG_U_SPACING_MARK',

	# N*: numbers
	Nd => 'PG_U_DECIMAL_NUMBER',
	Nl => 'PG_U_LETTER_NUMBER',
	No => 'PG_U_OTHER_NUMBER',

	# P*: punctuation
	Pd => 'PG_U_DASH_PUNCTUATION',
	Ps => 'PG_U_OPEN_PUNCTUATION',
	Pe => 'PG_U_CLOSE_PUNCTUATION',
	Pc => 'PG_U_CONNECTOR_PUNCTUATION',
	Po => 'PG_U_OTHER_PUNCTUATION',
	Pi => 'PG_U_INITIAL_PUNCTUATION',
	Pf => 'PG_U_FINAL_PUNCTUATION',

	# S*: symbols
	Sm => 'PG_U_MATH_SYMBOL',
	Sc => 'PG_U_CURRENCY_SYMBOL',
	Sk => 'PG_U_MODIFIER_SYMBOL',
	So => 'PG_U_OTHER_SYMBOL',

	# Z*: separators
	Zs => 'PG_U_SPACE_SEPARATOR',
	Zl => 'PG_U_LINE_SEPARATOR',
	Zp => 'PG_U_PARAGRAPH_SEPARATOR',

	# C*: control, format, private use, surrogate, unassigned
	Cc => 'PG_U_CONTROL',
	Cf => 'PG_U_FORMAT',
	Co => 'PG_U_PRIVATE_USE',
	Cs => 'PG_U_SURROGATE',
	Cn => 'PG_U_UNASSIGNED',
};

# Write the generated header: a fixed preamble, one initializer per
# collected range, and the closing brace of the array.
open my $OT, '>', $output_table_file
  or die "Could not open output file $output_table_file: $!\n";

# $num_ranges is interpolated into the array declaration near the end of
# the preamble; everything else is emitted as written.
print $OT <<HEADER;
/*-------------------------------------------------------------------------
 *
 * unicode_category_table.h
 *	  Category table for Unicode character classification.
 *
 * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/include/common/unicode_category_table.h
 *
 *-------------------------------------------------------------------------
 */

#include "common/unicode_category.h"

/*
 * File auto-generated by src/common/unicode/generate-unicode_category_table.pl,
 * do not edit. There is deliberately not an #ifndef PG_UNICODE_CATEGORY_TABLE_H
 * here.
 */
typedef struct
{
	uint32		first;			/* Unicode codepoint */
	uint32		last;			/* Unicode codepoint */
	uint8		category;		/* General Category */
}			pg_category_range;

/* table of Unicode codepoint ranges and their categories */
static const pg_category_range unicode_categories[$num_ranges] =
{
HEADER

# Format each range as one C initializer, translating the two-letter
# category into its PG_U_* constant. A category without a mapping is a
# hard error.
my @table_rows = ();
foreach my $range (@category_ranges) {
	my $category = $categories->{$range->{category}};
	die "category missing: $range->{category}" unless $category;
	push(@table_rows,
		sprintf("\t{0x%06x, 0x%06x, %s}",
			$range->{start}, $range->{end}, $category));
}

# Rows are separated by ",\n", with no trailing comma after the last one.
print $OT join(",\n", @table_rows);
print $OT "\n};\n";

# Output is buffered, so a failed write (for example a full disk) may
# only be reported when the handle is closed. Check it so a truncated
# header is not left behind with a successful exit status.
close $OT
  or die "Could not close output file $output_table_file: $!\n";