aboutsummaryrefslogtreecommitdiff
path: root/src/common/unicode/generate-unicode_normprops_table.pl
blob: e8e5097c094b09cfb32458db278ad3c2ae9576b2 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#!/usr/bin/perl
#
# Generate table of Unicode normalization "quick check" properties
# (see UAX #15).  Pass DerivedNormalizationProps.txt as argument.  The
# output is on stdout.
#
# Copyright (c) 2020, PostgreSQL Global Development Group

use strict;
use warnings;

my %data;

print
  "/* generated by src/common/unicode/generate-unicode_normprops_table.pl, do not edit */\n\n";

print <<EOS;
#include "common/unicode_norm.h"

/*
 * We use a bit field here to save space.
 */
typedef struct
{
	unsigned int codepoint:21;
	signed int	quickcheck:4;	/* really UnicodeNormalizationQC */
}			pg_unicode_normprops;
EOS

foreach my $line (<ARGV>)
{
	chomp $line;
	$line =~ s/\s*#.*$//;
	next if $line eq '';
	my ($codepoint, $prop, $value) = split /\s*;\s*/, $line;
	next if $prop !~ /_QC/;

	my ($first, $last);
	if ($codepoint =~ /\.\./)
	{
		($first, $last) = split /\.\./, $codepoint;
	}
	else
	{
		$first = $last = $codepoint;
	}

	foreach my $cp (hex($first) .. hex($last))
	{
		$data{$prop}{$cp} = $value;
	}
}

# We create a separate array for each normalization form rather than,
# say, a two-dimensional array, because that array would be very
# sparse and would create unnecessary overhead especially for the NFC
# lookup.
foreach my $prop (sort keys %data)
{
	# Don't build the tables for the "D" forms because they are too
	# big.  See also unicode_is_normalized_quickcheck().
	next if $prop eq "NFD_QC" || $prop eq "NFKD_QC";

	print "\n";
	print
	  "static const pg_unicode_normprops UnicodeNormProps_${prop}[] = {\n";

	my %subdata = %{ $data{$prop} };
	foreach my $cp (sort { $a <=> $b } keys %subdata)
	{
		my $qc;
		if ($subdata{$cp} eq 'N')
		{
			$qc = 'UNICODE_NORM_QC_NO';
		}
		elsif ($subdata{$cp} eq 'M')
		{
			$qc = 'UNICODE_NORM_QC_MAYBE';
		}
		else
		{
			die;
		}
		printf "\t{0x%04X, %s},\n", $cp, $qc;
	}

	print "};\n";
}