aboutsummaryrefslogtreecommitdiff
path: root/src/backend/utils/mb/Unicode/convutils.pm
diff options
context:
space:
mode:
authorHeikki Linnakangas <heikki.linnakangas@iki.fi>2016-11-30 14:54:02 +0200
committerHeikki Linnakangas <heikki.linnakangas@iki.fi>2016-11-30 14:54:52 +0200
commit1de9cc0dcca649d1900720924f4ea5c430d1a51e (patch)
tree5815918e2c884c77b48ce75a715f628e0fd2777c /src/backend/utils/mb/Unicode/convutils.pm
parent6c303223be34329bae2f03a87590ffa0742a65f6 (diff)
downloadpostgresql-1de9cc0dcca649d1900720924f4ea5c430d1a51e.tar.gz
postgresql-1de9cc0dcca649d1900720924f4ea5c430d1a51e.zip
Rewrite the perl scripts to produce our Unicode conversion tables.
Generate EUC_CN mappings from gb-18030-2000.xml, because GB2312.TXT is no longer available. Get UHC from windows-949-2000.xml, because it is more up-to-date. Plus tons more small changes. With these changes, the perl scripts faithfully produce the *.map files we have in the repository, from the external source files. In passing, fix the Makefile to also download CP932.TXT and CP950.TXT. Based on patches by Kyotaro Horiguchi, reviewed by Daniel Gustafsson. Discussion: https://postgr.es/m/08e7892a-d55c-eefe-76e6-7910bc8dd1f3@iki.fi
Diffstat (limited to 'src/backend/utils/mb/Unicode/convutils.pm')
-rw-r--r--src/backend/utils/mb/Unicode/convutils.pm282
1 files changed, 282 insertions, 0 deletions
diff --git a/src/backend/utils/mb/Unicode/convutils.pm b/src/backend/utils/mb/Unicode/convutils.pm
new file mode 100644
index 00000000000..d6a13e8c02c
--- /dev/null
+++ b/src/backend/utils/mb/Unicode/convutils.pm
@@ -0,0 +1,282 @@
+#
+# Copyright (c) 2001-2016, PostgreSQL Global Development Group
+#
+# src/backend/utils/mb/Unicode/convutils.pm
+
+use strict;
+
+#######################################################################
+# ucs2utf - convert a UCS-4 code point to UTF-8
+#
+# ucs ; Unicode code point, as an integer
+#
+# Returns the UTF-8 byte sequence packed into one integer, lead byte in
+# the most significant position (for example 0x20ac yields 0xe282ac).
+# Values up to 0x7f are returned unchanged.  Everything above 0xffff is
+# encoded as four bytes; no upper bound is checked.
+#
+sub ucs2utf
+{
+    my ($ucs) = @_;
+
+    # One byte: ASCII passes through untouched.
+    return $ucs if ($ucs <= 0x007f);
+
+    # The final continuation byte is built the same way in every
+    # multi-byte form: the low six bits, tagged with 0x80.
+    my $last_byte = ($ucs & 0x003f) | 0x80;
+
+    # Two bytes: 110xxxxx 10xxxxxx
+    if ($ucs <= 0x07ff)
+    {
+        my $lead = (($ucs >> 6) | 0xc0) << 8;
+        return $last_byte | $lead;
+    }
+
+    # Bits 6..11 form the second-to-last byte of the 3- and 4-byte forms.
+    my $penultimate = ((($ucs & 0x0fc0) >> 6) | 0x80) << 8;
+
+    # Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
+    if ($ucs <= 0xffff)
+    {
+        my $lead = (($ucs >> 12) | 0xe0) << 16;
+        return $lead | $penultimate | $last_byte;
+    }
+
+    # Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+    my $lead   = (($ucs >> 18) | 0xf0) << 24;
+    my $second = ((($ucs & 0x3ffff) >> 12) | 0x80) << 16;
+    return $lead | $second | $penultimate | $last_byte;
+}
+
+#######################################################################
+# read_source - common routine to read source file
+#
+# fname ; input file name
+#
+# Returns a reference to an array of hashes, one per accepted mapping
+# line, with these keys:
+#   f         - the input file name
+#   l         - line number within the input file
+#   code      - first hex column, as an integer
+#   ucs       - second hex column, as an integer
+#   comment   - the trailing "#..." text of the line
+#   direction - always "both"
+#
+# Dies if the file cannot be opened.  On a line that cannot be parsed,
+# reports it on STDERR and exits with a non-zero status.
+#
+sub read_source
+{
+    my ($fname) = @_;
+    my @r;
+
+    open(my $in, '<', $fname) || die("cannot open $fname: $!");
+
+    # Read into a lexical so the caller's $_ is left alone.
+    while (my $line = <$in>)
+    {
+        next if ($line =~ /^#/);
+
+        # chomp, not chop: a final line without a trailing newline must
+        # not lose its last real character.
+        chomp($line);
+
+        next if ($line =~ /^$/);    # Ignore empty lines
+
+        # Skip lines that carry a code but no mapping, only a comment.
+        # Hex digits are accepted in either case, like the pattern below.
+        next if ($line =~ /^0x([0-9A-Fa-f]+)\s+(#.*)$/);
+
+        # A mapping line is: code, UCS, an optional third hex column
+        # (matched but not used), then a comment.
+        # NOTE(review): an earlier comment here said "Skip the first
+        # column for JIS0208.TXT", but columns 1 and 2 are the ones used
+        # below -- confirm the intended handling of three-column files.
+        if ($line !~ /^0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)\s+(?:0x([0-9A-Fa-f]+)\s+)?(#.*)$/)
+        {
+            print STDERR "READ ERROR at line $. in $fname: $line\n";
+
+            # Exit with a failure status so the calling build notices.
+            exit 1;
+        }
+        my $out = {f => $fname, l => $.,
+                   code => hex($1),
+                   ucs => hex($2),
+                   comment => $4,
+                   direction => "both"
+                  };
+
+        # Ignore pure ASCII mappings. PostgreSQL character conversion code
+        # never even passes these to the conversion code.
+        next if ($out->{code} < 0x80 || $out->{ucs} < 0x80);
+
+        push(@r, $out);
+    }
+    close($in);
+
+    return \@r;
+}
+
+##################################################################
+# print_tables : output mapping tables
+#
+# Splits the mapping table by direction and by whether an entry is a
+# "combined" character, then writes one .map file per non-empty group.
+# The two plain (non-combined) files are always written; the combined
+# files are written only when there is at least one combined entry for
+# that direction.
+#
+# Arguments:
+# charset - string name of the character set.
+# table - mapping table (see format below)
+# verbose - if 1, output comment on each line,
+# if 2, also output source file name and number
+#
+# Mapping table format:
+#
+# Mapping table is a list of hashes. Each hash has the following fields:
+# direction - Direction: 'both', 'from_unicode' or 'to_unicode'
+# ucs - Unicode code point
+# ucs_second - Second Unicode code point, if this is a "combined" character.
+# code - Byte sequence in the "other" character set, as an integer
+# comment - Text representation of the character
+# f - Source filename
+# l - Line number in source file
+#
+sub print_tables
+{
+    my ($charset, $table, $verbose) = @_;
+
+    # One bucket per output file.
+    my (@to_unicode, @to_unicode_combined);
+    my (@from_unicode, @from_unicode_combined);
+
+    foreach my $mapping (@$table)
+    {
+        my $is_combined = defined $mapping->{ucs_second};
+
+        # Output entries carry UTF-8 values instead of code points.
+        my %fields = (utf8 => ucs2utf($mapping->{ucs}),
+                      code => $mapping->{code},
+                      comment => $mapping->{comment},
+                      f => $mapping->{f},
+                      l => $mapping->{l});
+        $fields{utf8_second} = ucs2utf($mapping->{ucs_second})
+          if ($is_combined);
+        my $entry = \%fields;
+
+        my $direction = $mapping->{direction};
+        my $goes_to_unicode =
+          ($direction eq "both" || $direction eq "to_unicode");
+        my $goes_from_unicode =
+          ($direction eq "both" || $direction eq "from_unicode");
+
+        # A "both" entry lands in the to- and from- bucket alike.
+        if ($is_combined)
+        {
+            push(@to_unicode_combined, $entry) if ($goes_to_unicode);
+            push(@from_unicode_combined, $entry) if ($goes_from_unicode);
+        }
+        else
+        {
+            push(@to_unicode, $entry) if ($goes_to_unicode);
+            push(@from_unicode, $entry) if ($goes_from_unicode);
+        }
+    }
+
+    print_to_utf8_map($charset, \@to_unicode, $verbose);
+    if (@to_unicode_combined)
+    {
+        print_to_utf8_combined_map($charset, \@to_unicode_combined, $verbose);
+    }
+    print_from_utf8_map($charset, \@from_unicode, $verbose);
+    if (@from_unicode_combined)
+    {
+        print_from_utf8_combined_map($charset, \@from_unicode_combined, $verbose);
+    }
+}
+
+# print_from_utf8_map - write the UTF8 => charset table
+#
+# Writes a C array of pg_utf_to_local entries to the file
+# utf8_to_<charset>.map (name lower-cased) in the current directory,
+# sorted numerically by UTF-8 value.
+#
+# Arguments:
+# charset - string name of the character set
+# table - reference to a list of hashes with utf8, code, comment, f, l
+# verbose - if true, append each entry's comment to its line;
+# if 2 or more, prefix the comment with source file and line
+#
+# Dies if the output file cannot be opened or cannot be closed cleanly.
+sub print_from_utf8_map
+{
+    my ($charset, $table, $verbose) = @_;
+
+    my $last_comment = "";
+
+    my $fname = lc("utf8_to_${charset}.map");
+    print "- Writing UTF8=>${charset} conversion table: $fname\n";
+    open(my $out, '>', $fname) || die "cannot open output file : $fname: $!\n";
+
+    # Names are passed as arguments, not interpolated into the format,
+    # so a stray '%' in them cannot be taken as a conversion.
+    printf($out "/* src/backend/utils/mb/Unicode/%s */\n\n".
+           "static const pg_utf_to_local ULmap%s[ %d ] = {",
+           $fname, $charset, scalar(@$table));
+    my $first = 1;
+    foreach my $i (sort {$$a{utf8} <=> $$b{utf8}} @$table)
+    {
+        # An entry's comment goes after the comma that follows it, so it
+        # is held in $last_comment until the next iteration.
+        print($out ",") if (!$first);
+        $first = 0;
+        print($out "\t/* $last_comment */") if ($verbose);
+
+        printf($out "\n {0x%04x, 0x%04x}", $$i{utf8}, $$i{code});
+        if ($verbose >= 2)
+        {
+            $last_comment = "$$i{f}:$$i{l} $$i{comment}";
+        }
+        else
+        {
+            $last_comment = $$i{comment};
+        }
+    }
+    print($out "\t/* $last_comment */") if ($verbose);
+    print $out "\n};\n";
+
+    # Buffered write errors only surface at close time.
+    close($out) || die "cannot close output file : $fname: $!\n";
+}
+
+# print_from_utf8_combined_map - write the UTF8 => charset table for
+# "combined" characters
+#
+# Writes a C array of pg_utf_to_local_combined entries to the file
+# utf8_to_<charset>_combined.map (name lower-cased) in the current
+# directory, sorted numerically by the first UTF-8 value.
+#
+# Arguments:
+# charset - string name of the character set
+# table - reference to a list of hashes with utf8, utf8_second, code,
+# comment, f, l
+# verbose - if true, append each entry's comment to its line;
+# if 2 or more, prefix the comment with source file and line
+#
+# Dies if the output file cannot be opened or cannot be closed cleanly.
+sub print_from_utf8_combined_map
+{
+    my ($charset, $table, $verbose) = @_;
+
+    my $last_comment = "";
+
+    my $fname = lc("utf8_to_${charset}_combined.map");
+    print "- Writing UTF8=>${charset} conversion table: $fname\n";
+    open(my $out, '>', $fname) || die "cannot open output file : $fname: $!\n";
+
+    # Names are passed as arguments, not interpolated into the format,
+    # so a stray '%' in them cannot be taken as a conversion.
+    printf($out "/* src/backend/utils/mb/Unicode/%s */\n\n".
+           "static const pg_utf_to_local_combined ULmap%s_combined[ %d ] = {",
+           $fname, $charset, scalar(@$table));
+    my $first = 1;
+    foreach my $i (sort {$$a{utf8} <=> $$b{utf8}} @$table)
+    {
+        # An entry's comment goes after the comma that follows it, so it
+        # is held in $last_comment until the next iteration.
+        print($out ",") if (!$first);
+        $first = 0;
+        print($out "\t/* $last_comment */") if ($verbose);
+
+        printf($out "\n {0x%08x, 0x%08x, 0x%04x}", $$i{utf8}, $$i{utf8_second}, $$i{code});
+
+        # Honour verbose level 2 in the same way as the plain map writers.
+        if ($verbose >= 2)
+        {
+            $last_comment = "$$i{f}:$$i{l} $$i{comment}";
+        }
+        else
+        {
+            $last_comment = "$$i{comment}";
+        }
+    }
+    print($out "\t/* $last_comment */") if ($verbose);
+    print $out "\n};\n";
+
+    # Buffered write errors only surface at close time.
+    close($out) || die "cannot close output file : $fname: $!\n";
+}
+
+# print_to_utf8_map - write the charset => UTF8 table
+#
+# Writes a C array of pg_local_to_utf entries to the file
+# <charset>_to_utf8.map (name lower-cased) in the current directory,
+# sorted numerically by the charset's code value.
+#
+# Arguments:
+# charset - string name of the character set
+# table - reference to a list of hashes with utf8, code, comment, f, l
+# verbose - if true, append each entry's comment to its line;
+# if 2 or more, prefix the comment with source file and line
+#
+# Dies if the output file cannot be opened or cannot be closed cleanly.
+sub print_to_utf8_map
+{
+    my ($charset, $table, $verbose) = @_;
+
+    my $last_comment = "";
+
+    my $fname = lc("${charset}_to_utf8.map");
+
+    print "- Writing ${charset}=>UTF8 conversion table: $fname\n";
+    open(my $out, '>', $fname) || die "cannot open output file : $fname: $!\n";
+
+    # Names are passed as arguments, not interpolated into the format,
+    # so a stray '%' in them cannot be taken as a conversion.
+    printf($out "/* src/backend/utils/mb/Unicode/%s */\n\n".
+           "static const pg_local_to_utf LUmap%s[ %d ] = {",
+           $fname, $charset, scalar(@$table));
+    my $first = 1;
+    foreach my $i (sort {$$a{code} <=> $$b{code}} @$table)
+    {
+        # An entry's comment goes after the comma that follows it, so it
+        # is held in $last_comment until the next iteration.
+        print($out ",") if (!$first);
+        $first = 0;
+        print($out "\t/* $last_comment */") if ($verbose);
+
+        printf($out "\n {0x%04x, 0x%x}", $$i{code}, $$i{utf8});
+        if ($verbose >= 2)
+        {
+            $last_comment = "$$i{f}:$$i{l} $$i{comment}";
+        }
+        else
+        {
+            $last_comment = $$i{comment};
+        }
+    }
+    print($out "\t/* $last_comment */") if ($verbose);
+    print $out "\n};\n";
+
+    # Buffered write errors only surface at close time.
+    close($out) || die "cannot close output file : $fname: $!\n";
+}
+
+# print_to_utf8_combined_map - write the charset => UTF8 table for
+# "combined" characters
+#
+# Writes a C array of pg_local_to_utf_combined entries to the file
+# <charset>_to_utf8_combined.map (name lower-cased) in the current
+# directory, sorted numerically by the charset's code value.
+#
+# Arguments:
+# charset - string name of the character set
+# table - reference to a list of hashes with utf8, utf8_second, code,
+# comment, f, l
+# verbose - if true, append each entry's comment to its line;
+# if 2 or more, prefix the comment with source file and line
+#
+# Dies if the output file cannot be opened or cannot be closed cleanly.
+sub print_to_utf8_combined_map
+{
+    my ($charset, $table, $verbose) = @_;
+
+    my $last_comment = "";
+
+    my $fname = lc("${charset}_to_utf8_combined.map");
+
+    print "- Writing ${charset}=>UTF8 conversion table: $fname\n";
+    open(my $out, '>', $fname) || die "cannot open output file : $fname: $!\n";
+
+    # Names are passed as arguments, not interpolated into the format,
+    # so a stray '%' in them cannot be taken as a conversion.
+    printf($out "/* src/backend/utils/mb/Unicode/%s */\n\n".
+           "static const pg_local_to_utf_combined LUmap%s_combined[ %d ] = {",
+           $fname, $charset, scalar(@$table));
+    my $first = 1;
+    foreach my $i (sort {$$a{code} <=> $$b{code}} @$table)
+    {
+        # An entry's comment goes after the comma that follows it, so it
+        # is held in $last_comment until the next iteration.
+        print($out ",") if (!$first);
+        $first = 0;
+        print($out "\t/* $last_comment */") if ($verbose);
+
+        printf($out "\n {0x%04x, 0x%08x, 0x%08x}", $$i{code}, $$i{utf8}, $$i{utf8_second});
+
+        # Honour verbose level 2 in the same way as the plain map writers.
+        if ($verbose >= 2)
+        {
+            $last_comment = "$$i{f}:$$i{l} $$i{comment}";
+        }
+        else
+        {
+            $last_comment = "$$i{comment}";
+        }
+    }
+    print($out "\t/* $last_comment */") if ($verbose);
+    print $out "\n};\n";
+
+    # Buffered write errors only surface at close time.
+    close($out) || die "cannot close output file : $fname: $!\n";
+}
+
+1;