#!/usr/bin/perl
#
# Generate Unicode character case mappings. Does not include tailoring
# or locale-specific mappings.
#
# Input: UnicodeData.txt
# Output: unicode_case_table.h
#
# Both files live in the directory given by --outdir (default: the current
# directory).
#
# Copyright (c) 2000-2024, PostgreSQL Global Development Group

use strict;
use warnings FATAL => 'all';
use Getopt::Long;

use FindBin;
use lib "$FindBin::RealBin/../../tools/";

my $output_path = '.';

GetOptions('outdir:s' => \$output_path);

my $input_file = "$output_path/UnicodeData.txt";
my $output_table_file = "$output_path/unicode_case_table.h";

# Codepoints below this bound get one table row each, whether or not they
# have a case mapping; codepoints at or above it only get a row when at
# least one simple mapping exists.
my $dense_limit = 0x80;

# Row format shared by the dense and the sparse part of the table.
my $entry_format =
  "\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x}},\n";

# codepoint => { Simple_Lowercase, Simple_Titlecase, Simple_Uppercase },
# present only for codepoints with at least one simple case mapping.
my %simple = ();

open(my $FH, '<', $input_file)
  or die "Could not open $output_path/UnicodeData.txt: $!.";
while (my $line = <$FH>)
{
	chomp $line;

	# A negative LIMIT keeps trailing empty fields.  Without it, a final
	# record lacking a newline would lose its (usually empty) case-mapping
	# fields and trip an "uninitialized value" fatal warning below.
	my @elts = split(/;/, $line, -1);
	die "$input_file line $.: expected at least 15 fields, found "
	  . scalar(@elts) . "\n"
	  if @elts < 15;

	my $code = hex($elts[0]);

	# Fields 12..14 hold the simple uppercase, lowercase and titlecase
	# mappings, in that order.  An empty field yields 0, i.e. "no mapping".
	my ($simple_uppercase, $simple_lowercase, $simple_titlecase) =
	  map { hex(s/^\s+|\s+$//rg) } @elts[ 12 .. 14 ];

	die "codepoint $code out of range" if $code > 0x10FFFF;
	die "Simple_Lowercase $code out of range"
	  if $simple_lowercase > 0x10FFFF;
	die "Simple_Titlecase $code out of range"
	  if $simple_titlecase > 0x10FFFF;
	die "Simple_Uppercase $code out of range"
	  if $simple_uppercase > 0x10FFFF;

	next
	  unless $simple_lowercase || $simple_titlecase || $simple_uppercase;

	# A missing mapping means the codepoint maps to itself.
	$simple{$code} = {
		Simple_Lowercase => ($simple_lowercase || $code),
		Simple_Titlecase => ($simple_titlecase || $code),
		Simple_Uppercase => ($simple_uppercase || $code)
	};
}
close $FH;

# Start writing out the output files
open my $OT, '>', $output_table_file
  or die "Could not open output file $output_table_file: $!\n";

# determine size of array given that codepoints < 0x80 are dense and
# the rest of the entries are sparse
my @sparse_codes =
  sort { $a <=> $b } grep { $_ >= $dense_limit } keys %simple;
my $num_simple = $dense_limit + scalar(@sparse_codes);

print {$OT} <<"EOS";
/*-------------------------------------------------------------------------
 *
 * unicode_case_table.h
 *	  Case mapping and information table.
 *
 * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/include/common/unicode_case_table.h
 *
 *-------------------------------------------------------------------------
 */

/*
 * File auto-generated by src/common/unicode/generate-unicode_case_table.pl,
 * do not edit. There is deliberately not an #ifndef PG_UNICODE_CASE_TABLE_H
 * here.
 */

#include "common/unicode_case.h"
#include "mb/pg_wchar.h"

typedef enum
{
	CaseLower = 0,
	CaseTitle = 1,
	CaseUpper = 2,
	NCaseKind
} CaseKind;

typedef struct
{
	pg_wchar	codepoint;		/* Unicode codepoint */
	pg_wchar	simplemap[NCaseKind];
} pg_case_map;

/*
 * Case mapping table. Dense for codepoints < 0x80 (enabling fast lookup),
 * sparse for higher codepoints (requiring scan or binary search).
 */
static const pg_case_map case_map[$num_simple] =
{
EOS

print {$OT} "\t/* begin dense entries for codepoints < 0x80 */\n";
for my $code (0 .. $dense_limit - 1)
{
	# Plain rvalue lookup: unlike $simple{$code}{...}, this does not
	# autovivify an entry for codepoints that have no mapping.
	my $map = $simple{$code} // {};

	printf {$OT} $entry_format, $code,
	  ($map->{Simple_Lowercase} || $code),
	  ($map->{Simple_Titlecase} || $code),
	  ($map->{Simple_Uppercase} || $code);
}
print {$OT} "\n";

print {$OT} "\t/* begin sparse entries for codepoints >= 0x80 */\n";
foreach my $code (@sparse_codes)
{
	my $map = $simple{$code};
	printf {$OT} $entry_format, $code,
	  $map->{Simple_Lowercase},
	  $map->{Simple_Titlecase},
	  $map->{Simple_Uppercase};
}
print {$OT} "};\n";

# Buffered write errors (e.g. a full disk) only surface at close, so check
# it rather than silently leaving a truncated header behind.
close $OT
  or die "Could not close output file $output_table_file: $!\n";