aboutsummaryrefslogtreecommitdiff
path: root/src/common/unicode/generate-unicode_combining_table.pl
blob: e468a5f8c99222c9dcaf4204c566d5fc323fdc86 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
#!/usr/bin/perl
#
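# Generate sorted list of non-overlapping intervals of non-spacing
# (combining) characters, i.e. those with general category Mn or Me,
# using Unicode data files as input.  Only code points up to U+FFFF
# are considered.  Pass UnicodeData.txt as argument.  The output is
# on stdout.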
#
# Copyright (c) 2019, PostgreSQL Global Development Group

use strict;
use warnings;

# First and last code point of the combining-character range currently
# being accumulated; both are undef while no range is open.
my $range_start;
my $range_end;

print "/* generated by src/common/unicode/generate-unicode_combining_table.pl, do not edit */\n\n";

print "static const struct mbinterval combining[] = {\n";

# Read the input one line at a time; iterating over <ARGV> in list
# context would pull the whole file into memory first.
while (my $line = <ARGV>)
{
    chomp $line;

    # Fields are semicolon-separated: field 0 is the code point in hex,
    # field 2 is the general category.
    my @fields = split ';', $line;
    my $codepoint = hex $fields[0];

    # Only code points up to U+FFFF are put into the table.
    next if $codepoint > 0xFFFF;

    if ($fields[2] eq 'Me' || $fields[2] eq 'Mn')
    {
        # combining character: open a range if none is open, and
        # remember this code point as the current end of the range
        $range_start = $codepoint if !defined($range_start);
        $range_end = $codepoint;
    }
    elsif (defined($range_start))
    {
        # not a combining character, print out the range that just ended
        printf "\t{0x%04X, 0x%04X},\n", $range_start, $range_end;
        $range_start = undef;
        $range_end = undef;
    }
}

# A range still open here was never followed by a non-combining line
# at or below U+FFFF; emit it so it is not silently dropped.
if (defined($range_start))
{
    printf "\t{0x%04X, 0x%04X},\n", $range_start, $range_end;
}

print "};\n";