Rewrite the perl scripts to produce our Unicode conversion tables.

Generate EUC_CN mappings from gb-18030-2000.xml, because GB2312.TXT is no longer available. Get UHC from windows-949-2000.xml, because it's more up-to-date. Plus tons more small changes. With these changes, the perl scripts faithfully produce the *.map files we have in the repository, from the external source files. In passing, fix the Makefile to also download CP932.TXT and CP950.TXT. Based on patches by Kyotaro Horiguchi, reviewed by Daniel Gustafsson. Discussion: https://postgr.es/m/08e7892a-d55c-eefe-76e6-7910bc8dd1f3@iki.fi
author: Heikki Linnakangas <heikki.linnakangas@iki.fi> 2016-11-30 14:54:02 +0200
committer: Heikki Linnakangas <heikki.linnakangas@iki.fi> 2016-11-30 14:54:52 +0200
commit: 1de9cc0dcca649d1900720924f4ea5c430d1a51e (patch)
tree: 5815918e2c884c77b48ce75a715f628e0fd2777c /src/backend/utils/mb/Unicode/convutils.pm
parent: 6c303223be34329bae2f03a87590ffa0742a65f6 (diff)
download: postgresql-1de9cc0dcca649d1900720924f4ea5c430d1a51e.tar.gz
postgresql-1de9cc0dcca649d1900720924f4ea5c430d1a51e.zip
1 files changed, 282 insertions, 0 deletions
diff --git a/src/backend/utils/mb/Unicode/convutils.pm b/src/backend/utils/mb/Unicode/convutils.pm
new file mode 100644
index 00000000000..d6a13e8c02c
--- /dev/null
+++ b/src/backend/utils/mb/Unicode/convutils.pm
@@ -0,0 +1,282 @@
+#
+# Copyright (c) 2001-2016, PostgreSQL Global Development Group
+#
+# src/backend/utils/mb/Unicode/convutils.pm
+
+use strict;
+
+#######################################################################
+# convert UCS-4 to UTF-8
+#
+sub ucs2utf
+{
+	# Encode code point $ucs as UTF-8 and return the bytes packed into one
+	# integer, with the lead byte in the most significant position.
+	my ($ucs) = @_;
+
+	# One byte: ASCII is returned unchanged.
+	return $ucs if ($ucs <= 0x007f);
+
+	# The low six bits always form the last continuation byte.
+	my $tail = ($ucs & 0x003f) | 0x80;
+
+	# Two bytes: 110xxxxx 10xxxxxx
+	return $tail | ((($ucs >> 6) | 0xc0) << 8) if ($ucs <= 0x07ff);
+
+	# Bits 6-11 form the next-to-last byte of the 3- and 4-byte forms.
+	my $mid = ((($ucs & 0x0fc0) >> 6) | 0x80) << 8;
+
+	# Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
+	return $tail | $mid | ((($ucs >> 12) | 0xe0) << 16) if ($ucs <= 0xffff);
+
+	# Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+	my $lead   = (($ucs >> 18) | 0xf0) << 24;
+	my $second = ((($ucs & 0x3ffff) >> 12) | 0x80) << 16;
+
+	# Anything above 0xffff takes this path; no upper bound is enforced.
+	return $lead | $second | $mid | $tail;
+}
+
+#######################################################################
+# read_source - common routine to read source file
+#
+# fname ; input file name
+sub read_source
+{
+	# Parse mapping file $fname; returns a reference to a list of hashes
+	# with the keys f, l, code, ucs, comment and direction (always "both").
+	my ($fname) = @_;
+	my @r;
+
+	open(my $in, '<', $fname) || die("cannot open $fname: $!");
+
+	while (<$in>)
+	{
+		next if (/^#/);
+
+		# chomp rather than chop, so that a last line lacking a newline
+		# does not lose a real character.
+		chomp;
+		next if (/^$/); # Ignore empty lines
+
+		# Skip lines that carry a code and a comment but no mapping.
+		next if (/^0x([0-9A-F]+)\s+(#.*)$/);
+
+		# Code, UCS, optional third hex column (matched, unused), comment.
+		if (!/^0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)\s+(?:0x([0-9A-Fa-f]+)\s+)?(#.*)$/)
+		{
+			print STDERR "READ ERROR at line $. in $fname: $_\n";
+			exit 1; # non-zero, so the failure is visible in the exit status
+		}
+		my $out = {f => $fname, l => $., code => hex($1), ucs => hex($2),
+				   comment => $4, direction => "both"};
+
+		# Ignore pure ASCII mappings. PostgreSQL character conversion code
+		# never even passes these to the conversion code.
+		next if ($out->{code} < 0x80 || $out->{ucs} < 0x80);
+		push(@r, $out);
+	}
+	close($in);
+
+	return \@r;
+}
+
+##################################################################
+# print_tables : output mapping tables
+#
+# Arguments:
+#  charset - string name of the character set.
+#  table   - mapping table (see format below)
+#  verbose - if 1, output comment on each line,
+#            if 2, also output source file name and line number
+#
+#
+#
+# Mapping table format:
+#
+# Mapping table is a list of hashes. Each hash has the following fields:
+#   direction  - Direction: 'both', 'from_unicode' or 'to_unicode'
+#   ucs        - Unicode code point
+#   ucs_second - Second Unicode code point, if this is a "combined" character.
+#   code       - Byte sequence in the "other" character set, as an integer
+#   comment    - Text representation of the character
+#   f          - Source filename
+#   l          - Line number in source file
+#
+#
+sub print_tables
+{
+	# Write the conversion table files for $charset from the mappings in
+	# $table; $verbose is handed on unchanged to the individual writers.
+	my ($charset, $table, $verbose) = @_;
+
+	# The table is split four ways: by conversion direction, and by whether
+	# the Unicode side is one code point or a combined pair.
+	my (@to_unicode, @to_unicode_combined);
+	my (@from_unicode, @from_unicode_combined);
+
+	foreach my $map (@$table)
+	{
+		my $dir      = $map->{direction};
+		my $combined = defined $map->{ucs_second};
+
+		# Code points are converted to packed UTF-8 for the output entry.
+		my %entry = (
+			utf8    => ucs2utf($map->{ucs}),
+			code    => $map->{code},
+			comment => $map->{comment},
+			f       => $map->{f},
+			l       => $map->{l},
+		);
+
+		# Combined mappings carry a second code point as well.
+		$entry{utf8_second} = ucs2utf($map->{ucs_second}) if ($combined);
+
+		# A 'both' mapping puts the same entry on both lists.
+		if ($dir eq "both" || $dir eq "to_unicode")
+		{
+			my $dest = $combined ? \@to_unicode_combined : \@to_unicode;
+			push @$dest, \%entry;
+		}
+
+		if ($dir eq "both" || $dir eq "from_unicode")
+		{
+			my $dest = $combined ? \@from_unicode_combined : \@from_unicode;
+			push @$dest, \%entry;
+		}
+	}
+
+	# To-UTF8 files first, then from-UTF8. A combined file is only written
+	# when at least one combined mapping exists for that direction.
+	print_to_utf8_map($charset, \@to_unicode, $verbose);
+	print_to_utf8_combined_map($charset, \@to_unicode_combined, $verbose) if (scalar @to_unicode_combined > 0);
+
+	print_from_utf8_map($charset, \@from_unicode, $verbose);
+	print_from_utf8_combined_map($charset, \@from_unicode_combined, $verbose) if (scalar @from_unicode_combined > 0);
+}
+
+sub print_from_utf8_map
+{
+	# Write utf8_to_<charset>.map, sorted by UTF-8 value. An entry's comment
+	# is emitted after the comma that follows it, hence $last_comment.
+	my ($charset, $table, $verbose) = @_;
+
+	my $last_comment = "";
+
+	my $fname = lc("utf8_to_${charset}.map");
+	print "- Writing UTF8=>${charset} conversion table: $fname\n";
+	open(my $out, '>', $fname) || die "cannot open output file : $fname\n";
+	printf($out "/* src/backend/utils/mb/Unicode/$fname */\n\n".
+		   "static const pg_utf_to_local ULmap${charset}[ %d ] = {",
+		   scalar(@$table));
+	my $first = 1;
+	foreach my $i (sort {$$a{utf8} <=> $$b{utf8}} @$table)
+	{
+		print($out ",") if (!$first);
+		$first = 0;
+
+		# Nothing is pending before the first entry, or when the previous
+		# entry had no comment; do not emit an empty comment then.
+		print($out "\t/* $last_comment */") if ($verbose && $last_comment ne "");
+
+		printf($out "\n  {0x%04x, 0x%04x}", $$i{utf8}, $$i{code});
+
+		# With verbose >= 2, prefix the source file name and line number.
+		$last_comment = ($verbose >= 2) ? "$$i{f}:$$i{l} $$i{comment}" : "$$i{comment}";
+	}
+	print($out "\t/* $last_comment */") if ($verbose && $last_comment ne "");
+	print $out "\n};\n";
+	close($out) || die "cannot close output file : $fname: $!\n";
+}
+
+sub print_from_utf8_combined_map
+{
+	# Write utf8_to_<charset>_combined.map, sorted by the first UTF-8 value.
+	my ($charset, $table, $verbose) = @_;
+	my $last_comment = "";
+	my $fname = lc("utf8_to_${charset}_combined.map");
+	print "- Writing UTF8=>${charset} conversion table: $fname\n";
+	open(my $out, '>', $fname) || die "cannot open output file : $fname\n";
+	printf($out "/* src/backend/utils/mb/Unicode/$fname */\n\n".
+		   "static const pg_utf_to_local_combined ULmap${charset}_combined[ %d ] = {",
+		   scalar(@$table));
+	my $first = 1;
+	foreach my $i (sort {$$a{utf8} <=> $$b{utf8}} @$table)
+	{
+		print($out ",") if (!$first);
+		$first = 0;
+		# Skip the comment when none is pending (first entry, or empty text).
+		print($out "\t/* $last_comment */") if ($verbose && $last_comment ne "");
+		printf($out "\n  {0x%08x, 0x%08x, 0x%04x}", $$i{utf8}, $$i{utf8_second}, $$i{code});
+		# With verbose >= 2, prefix the source file name and line number.
+		$last_comment = ($verbose >= 2) ? "$$i{f}:$$i{l} $$i{comment}" : "$$i{comment}";
+	}
+	print($out "\t/* $last_comment */") if ($verbose && $last_comment ne "");
+	print $out "\n};\n";
+	close($out) || die "cannot close output file : $fname: $!\n";
+}
+
+sub print_to_utf8_map
+{
+	# Write <charset>_to_utf8.map, sorted by local code. An entry's comment
+	# is emitted after the comma that follows it, hence $last_comment.
+	my ($charset, $table, $verbose) = @_;
+
+	my $last_comment = "";
+
+	my $fname = lc("${charset}_to_utf8.map");
+
+	print "- Writing ${charset}=>UTF8 conversion table: $fname\n";
+	open(my $out, '>', $fname) || die "cannot open output file : $fname\n";
+	printf($out "/* src/backend/utils/mb/Unicode/${fname} */\n\n".
+		   "static const pg_local_to_utf LUmap${charset}[ %d ] = {",
+		   scalar(@$table));
+	my $first = 1;
+	foreach my $i (sort {$$a{code} <=> $$b{code}} @$table)
+	{
+		print($out ",") if (!$first);
+		$first = 0;
+
+		# Nothing is pending before the first entry, or when the previous
+		# entry had no comment; do not emit an empty comment then.
+		print($out "\t/* $last_comment */") if ($verbose && $last_comment ne "");
+
+		printf($out "\n  {0x%04x, 0x%x}", $$i{code}, $$i{utf8});
+
+		# With verbose >= 2, prefix the source file name and line number.
+		$last_comment = ($verbose >= 2) ? "$$i{f}:$$i{l} $$i{comment}" : "$$i{comment}";
+	}
+	print($out "\t/* $last_comment */") if ($verbose && $last_comment ne "");
+	print $out "\n};\n";
+	close($out) || die "cannot close output file : $fname: $!\n";
+}
+
+sub print_to_utf8_combined_map
+{
+	# Write <charset>_to_utf8_combined.map, sorted by local code.
+	my ($charset, $table, $verbose) = @_;
+
+	my $last_comment = "";
+	my $fname = lc("${charset}_to_utf8_combined.map");
+	print "- Writing ${charset}=>UTF8 conversion table: $fname\n";
+	open(my $out, '>', $fname) || die "cannot open output file : $fname\n";
+	printf($out "/* src/backend/utils/mb/Unicode/${fname} */\n\n".
+		   "static const pg_local_to_utf_combined LUmap${charset}_combined[ %d ] = {",
+		   scalar(@$table));
+	my $first = 1;
+	foreach my $i (sort {$$a{code} <=> $$b{code}} @$table)
+	{
+		print($out ",") if (!$first);
+		$first = 0;
+		# Skip the comment when none is pending (first entry, or empty text).
+		print($out "\t/* $last_comment */") if ($verbose && $last_comment ne "");
+		printf($out "\n  {0x%04x, 0x%08x, 0x%08x}", $$i{code}, $$i{utf8}, $$i{utf8_second});
+		# With verbose >= 2, prefix the source file name and line number.
+		$last_comment = ($verbose >= 2) ? "$$i{f}:$$i{l} $$i{comment}" : "$$i{comment}";
+	}
+	print($out "\t/* $last_comment */") if ($verbose && $last_comment ne "");
+	print $out "\n};\n";
+	close($out) || die "cannot close output file : $fname: $!\n";
+}
+
+1;
author	Heikki Linnakangas <heikki.linnakangas@iki.fi>	2016-11-30 14:54:02 +0200
committer	Heikki Linnakangas <heikki.linnakangas@iki.fi>	2016-11-30 14:54:52 +0200
commit	1de9cc0dcca649d1900720924f4ea5c430d1a51e (patch)
tree	5815918e2c884c77b48ce75a715f628e0fd2777c /src/backend/utils/mb/Unicode/convutils.pm
parent	6c303223be34329bae2f03a87590ffa0742a65f6 (diff)
download	postgresql-1de9cc0dcca649d1900720924f4ea5c430d1a51e.tar.gz postgresql-1de9cc0dcca649d1900720924f4ea5c430d1a51e.zip