Use radix tree for character encoding conversions.

Replace the mapping tables used to convert between UTF-8 and other character encodings with new radix tree-based maps. Looking up an entry in a radix tree is much faster than a binary search in the old maps. As a bonus, the radix tree representation is also more compact, making the binaries slightly smaller. The "combined" maps work the same as before, with binary search. They are much smaller than the main tables, so it doesn't matter so much. However, the "combined" maps are now stored in the same .map files as the main tables. This seems more clear, since they're always used together, and generated from the same source files. Patch by Kyotaro Horiguchi, with lot of hacking by me at various stages. Reviewed by Michael Paquier and Daniel Gustafsson. Discussion: https://www.postgresql.org/message-id/20170306.171609.204324917.horiguchi.kyotaro%40lab.ntt.co.jp
author: Heikki Linnakangas <heikki.linnakangas@iki.fi> 2017-03-13 20:46:39 +0200
committer: Heikki Linnakangas <heikki.linnakangas@iki.fi> 2017-03-13 20:46:39 +0200
commit: aeed17d00037950a16cc5ebad5b5592e5fa1ad0f (patch)
tree: 070aac060c2f923b5c636afbab51272bd2d04056 /src/backend/utils/mb/Unicode/convutils.pm
parent: 84892692fdedb753cfdd9a63b318b47ec640915f (diff)
download: postgresql-aeed17d00037950a16cc5ebad5b5592e5fa1ad0f.tar.gz
postgresql-aeed17d00037950a16cc5ebad5b5592e5fa1ad0f.zip
1 files changed, 656 insertions, 150 deletions
diff --git a/src/backend/utils/mb/Unicode/convutils.pm b/src/backend/utils/mb/Unicode/convutils.pm
index eb3c602c32d..6bd84712b05 100644
--- a/src/backend/utils/mb/Unicode/convutils.pm
+++ b/src/backend/utils/mb/Unicode/convutils.pm
@@ -3,44 +3,27 @@
 #
 # src/backend/utils/mb/Unicode/convutils.pm
 
+package convutils;
+
 use strict;
 
-#######################################################################
-# convert UCS-4 to UTF-8
-#
-sub ucs2utf
-{
-	my ($ucs) = @_;
-	my $utf;
+use Exporter 'import';
 
-	if ($ucs <= 0x007f)
-	{
-		$utf = $ucs;
-	}
-	elsif ($ucs > 0x007f && $ucs <= 0x07ff)
-	{
-		$utf = (($ucs & 0x003f) | 0x80) | ((($ucs >> 6) | 0xc0) << 8);
-	}
-	elsif ($ucs > 0x07ff && $ucs <= 0xffff)
-	{
-		$utf =
-		  ((($ucs >> 12) | 0xe0) << 16) |
-		  (((($ucs & 0x0fc0) >> 6) | 0x80) << 8) | (($ucs & 0x003f) | 0x80);
-	}
-	else
-	{
-		$utf =
-		  ((($ucs >> 18) | 0xf0) << 24) |
-		  (((($ucs & 0x3ffff) >> 12) | 0x80) << 16) |
-		  (((($ucs & 0x0fc0) >> 6) | 0x80) << 8) | (($ucs & 0x003f) | 0x80);
-	}
-	return ($utf);
-}
+our @EXPORT = qw( NONE TO_UNICODE FROM_UNICODE BOTH read_source print_conversion_tables);
+
+# Constants used in the 'direction' field of the character maps
+use constant {
+	NONE         => 0,
+	TO_UNICODE   => 1,
+	FROM_UNICODE => 2,
+	BOTH         => 3
+};
 
 #######################################################################
 # read_source - common routine to read source file
 #
 # fname ; input file name
+#
 sub read_source
 {
 	my ($fname) = @_;
@@ -70,7 +53,9 @@ sub read_source
 				   code => hex($1),
 				   ucs => hex($2),
 				   comment => $4,
-				   direction => "both"
+				   direction => BOTH,
+				   f => $fname,
+				   l => $.
 				};
 
 		# Ignore pure ASCII mappings. PostgreSQL character conversion code
@@ -85,20 +70,18 @@ sub read_source
 }
 
 ##################################################################
-# print_tables : output mapping tables
+# print_conversion_tables - output mapping tables
 #
-# Arguments:
-#  charset - string name of the character set.
-#  table   - mapping table (see format below)
-#  verbose - if 1, output comment on each line,
-#            if 2, also output source file name and number
+# print_conversion_tables($this_script, $csname, \%charset)
 #
+# this_script - the name of the *caller script* of this feature
+# csname      - character set name other than ucs
+# charset     - ref to character set array
 #
+# Input character set array format:
 #
-# Mapping table format:
-#
-# Mapping table is a list of hashes. Each hash has the following fields:
-#   direction  - Direction: 'both', 'from_unicode' or 'to_unicode'
+# Each element in the character set array is a hash. Each hash has the following fields:
+#   direction  - BOTH, TO_UNICODE, or FROM_UNICODE (or NONE, to ignore the entry altogether)
 #   ucs        - Unicode code point
 #   ucs_second - Second Unicode code point, if this is a "combined" character.
 #   code       - Byte sequence in the "other" character set, as an integer
@@ -106,180 +89,703 @@ sub read_source
 #   f          - Source filename
 #   l          - Line number in source file
 #
+sub print_conversion_tables
+{
+	my ($this_script, $csname, $charset) = @_;
+
+	print_conversion_tables_direction($this_script, $csname, FROM_UNICODE, $charset);
+	print_conversion_tables_direction($this_script, $csname, TO_UNICODE,   $charset);
+}
+
+#############################################################################
+# INTERNAL ROUTINES
+
+#######################################################################
+# print_conversion_tables_direction - write the whole content of C source of radix tree
+#
+# print_conversion_tables_direction($this_script, $csname, $direction, \%charset, $tblwidth)
+#
+# this_script - the name of the *caller script* of this feature
+# csname      - character set name other than ucs
+# direction   - desired direction, TO_UNICODE or FROM_UNICODE
+# charset     - ref to character set array
 #
-sub print_tables
+sub print_conversion_tables_direction
 {
-	my ($charset, $table, $verbose) = @_;
+	my ($this_script, $csname, $direction, $charset) = @_;
 
-	# Build an array with only the to-UTF8 direction mappings
-	my @to_unicode;
-	my @to_unicode_combined;
-	my @from_unicode;
-	my @from_unicode_combined;
+	my $fname;
+	my $tblname;
+	if ($direction == TO_UNICODE)
+	{
+		$fname = lc("${csname}_to_utf8.map");
+		$tblname = lc("${csname}_to_unicode_tree");
 
-	foreach my $i (@$table)
+		print "- Writing ${csname}=>UTF8 conversion table: $fname\n";
+	}
+	else
 	{
-		if (defined $i->{ucs_second})
+		$fname = lc("utf8_to_${csname}.map");
+		$tblname = lc("${csname}_from_unicode_tree");
+
+		print "- Writing UTF8=>${csname} conversion table: $fname\n";
+	}
+
+	open(my $out, '>', $fname) || die("cannot open $fname");
+
+	print $out "/* src/backend/utils/mb/Unicode/$fname */\n";
+	print $out "/* This file is generated by $this_script */\n\n";
+
+	# Collect regular, non-combined, mappings, and create the radix tree from them.
+	my $charmap = &make_charmap($out, $charset, $direction, 0);
+	print_radix_table($out, $tblname, $charmap);
+
+	# Collect combined characters, and create combined character table (if any)
+	my $charmap_combined = &make_charmap_combined($charset, $direction);
+
+	if (scalar @{$charmap_combined} > 0)
+	{
+		if ($direction == TO_UNICODE)
 		{
-			my $entry = {utf8 => ucs2utf($i->{ucs}),
-						 utf8_second => ucs2utf($i->{ucs_second}),
-						 code => $i->{code},
-						 comment => $i->{comment},
-						 f => $i->{f}, l => $i->{l}};
-			if ($i->{direction} eq "both" || $i->{direction} eq "to_unicode")
-			{
-				push @to_unicode_combined, $entry;
-			}
-			if ($i->{direction} eq "both" || $i->{direction} eq "from_unicode")
-			{
-				push @from_unicode_combined, $entry;
-			}
+			print_to_utf8_combined_map($out, $csname,
+									   $charmap_combined, 1);
 		}
 		else
 		{
-			my $entry = {utf8 => ucs2utf($i->{ucs}),
-						 code => $i->{code},
-						 comment => $i->{comment},
-						 f => $i->{f}, l => $i->{l}};
-			if ($i->{direction} eq "both" || $i->{direction} eq "to_unicode")
-			{
-				push @to_unicode, $entry;
-			}
-			if ($i->{direction} eq "both" || $i->{direction} eq "from_unicode")
-			{
-				push @from_unicode, $entry;
-			}
+			print_from_utf8_combined_map($out, $csname,
+										 $charmap_combined, 1);
 		}
 	}
 
-	print_to_utf8_map($charset, \@to_unicode, $verbose);
-	print_to_utf8_combined_map($charset, \@to_unicode_combined, $verbose) if (scalar @to_unicode_combined > 0);
-	print_from_utf8_map($charset, \@from_unicode, $verbose);
-	print_from_utf8_combined_map($charset, \@from_unicode_combined, $verbose) if (scalar @from_unicode_combined > 0);
+	close($out);
 }
 
-sub print_from_utf8_map
+sub print_from_utf8_combined_map
 {
-	my ($charset, $table, $verbose) = @_;
+	my ($out, $charset, $table, $verbose) = @_;
 
 	my $last_comment = "";
 
-	my $fname = lc("utf8_to_${charset}.map");
-	print "- Writing UTF8=>${charset} conversion table: $fname\n";
-	open(my $out, '>', $fname) || die "cannot open output file : $fname\n";
-	printf($out "/* src/backend/utils/mb/Unicode/$fname */\n\n".
-		   "static const pg_utf_to_local ULmap${charset}[ %d ] = {",
-		   scalar(@$table));
+	printf $out "\n/* Combined character map */\n";
+	printf $out "static const pg_utf_to_local_combined ULmap${charset}_combined[ %d ] = {",
+	  scalar(@$table);
 	my $first = 1;
 	foreach my $i (sort {$a->{utf8} <=> $b->{utf8}} @$table)
     {
 		print($out ",") if (!$first);
 		$first = 0;
-		print($out "\t/* $last_comment */") if ($verbose);
+		print $out "\t/* $last_comment */" if ($verbose && $last_comment ne "");
 
-		printf($out "\n  {0x%04x, 0x%04x}", $i->{utf8}, $i->{code});
+		printf $out "\n  {0x%08x, 0x%08x, 0x%04x}",
+		  $i->{utf8}, $i->{utf8_second}, $i->{code};
 		if ($verbose >= 2)
 		{
-			$last_comment = "$i->{f}:$i->{l} $i->{comment}";
+			$last_comment =
+			  sprintf("%s:%d %s", $i->{f}, $i->{l}, $i->{comment});
 		}
-		else
+		elsif ($verbose >= 1)
 		{
 			$last_comment = $i->{comment};
 		}
 	}
-	print($out "\t/* $last_comment */") if ($verbose);
+	print $out "\t/* $last_comment */" if ($verbose && $last_comment ne "");
 	print $out "\n};\n";
-	close($out);
 }
 
-sub print_from_utf8_combined_map
+sub print_to_utf8_combined_map
 {
-	my ($charset, $table, $verbose) = @_;
+	my ($out, $charset, $table, $verbose) = @_;
 
 	my $last_comment = "";
 
-	my $fname = lc("utf8_to_${charset}_combined.map");
-	print "- Writing UTF8=>${charset} conversion table: $fname\n";
-	open(my $out, '>', $fname) || die "cannot open output file : $fname\n";
-	printf($out "/* src/backend/utils/mb/Unicode/$fname */\n\n".
-		   "static const pg_utf_to_local_combined ULmap${charset}_combined[ %d ] = {",
-		   scalar(@$table));
+	printf $out "\n/* Combined character map */\n";
+	printf $out "static const pg_local_to_utf_combined LUmap${charset}_combined[ %d ] = {",
+	  scalar(@$table);
+
 	my $first = 1;
-	foreach my $i (sort {$a->{utf8} <=> $b->{utf8}} @$table)
+	foreach my $i (sort {$a->{code} <=> $b->{code}} @$table)
     {
 		print($out ",") if (!$first);
 		$first = 0;
-		print($out "\t/* $last_comment */") if ($verbose);
+		print $out "\t/* $last_comment */" if ($verbose && $last_comment ne "");
+
+		printf $out "\n  {0x%04x, 0x%08x, 0x%08x}",
+		  $i->{code}, $i->{utf8}, $i->{utf8_second};
 
-		printf($out "\n  {0x%08x, 0x%08x, 0x%04x}", $i->{utf8}, $i->{utf8_second}, $i->{code});
-		$last_comment = "$i->{comment}";
+		if ($verbose >= 2)
+		{
+			$last_comment =
+				sprintf("%s:%d %s", $i->{f}, $i->{l}, $i->{comment});
+		}
+		elsif ($verbose >= 1)
+		{
+			$last_comment = $i->{comment};
+		}
 	}
-	print($out "\t/* $last_comment */") if ($verbose);
+	print $out "\t/* $last_comment */" if ($verbose && $last_comment ne "");
 	print $out "\n};\n";
-	close($out);
 }
 
-sub print_to_utf8_map
+#######################################################################
+# print_radix_table(<output handle>, <table name>, <charmap hash ref>)
+#
+# Input: A hash, mapping an input character to an output character.
+#
+# Constructs a radix tree from the hash, and prints it out as a C-struct.
+#
+sub print_radix_table
 {
-	my ($charset, $table, $verbose) = @_;
-
-	my $last_comment = "";
+	my ($out, $tblname, $c) = @_;
+
+	###
+	### Build radix trees in memory, for 1-, 2-, 3- and 4-byte inputs. Each
+	### radix tree is represented as a nested hash, each hash indexed by
+	### input byte
+	###
+	my %b1map;
+	my %b2map;
+	my %b3map;
+	my %b4map;
+	foreach my $in (keys %$c)
+	{
+		my $out = $c->{$in};
 
-	my $fname = lc("${charset}_to_utf8.map");
+		if ($in < 0x100)
+		{
+			$b1map{$in} = $out;
+		}
+		elsif ($in < 0x10000)
+		{
+			my $b1     = $in >> 8;
+			my $b2     = $in & 0xff;
 
-	print "- Writing ${charset}=>UTF8 conversion table: $fname\n";
-	open(my $out, '>', $fname) || die "cannot open output file : $fname\n";
-	printf($out "/* src/backend/utils/mb/Unicode/${fname} */\n\n".
-		   "static const pg_local_to_utf LUmap${charset}[ %d ] = {",
-		   scalar(@$table));
-	my $first = 1;
-	foreach my $i (sort {$a->{code} <=> $b->{code}} @$table)
-    {
-		print($out ",") if (!$first);
-		$first = 0;
-		print($out "\t/* $last_comment */") if ($verbose);
+			$b2map{$b1}{$b2} = $out;
+		}
+		elsif ($in < 0x1000000)
+		{
+			my $b1     = $in >> 16;
+			my $b2     = ($in >> 8) & 0xff;
+			my $b3     = $in & 0xff;
 
-		printf($out "\n  {0x%04x, 0x%x}", $i->{code}, $i->{utf8});
-		if ($verbose >= 2)
+			$b3map{$b1}{$b2}{$b3} = $out;
+		}
+		elsif ($in < 0x100000000)
 		{
-			$last_comment = "$i->{f}:$i->{l} $i->{comment}";
+			my $b1     = $in >> 24;
+			my $b2     = ($in >> 16) & 0xff;
+			my $b3     = ($in >> 8) & 0xff;
+			my $b4     = $in & 0xff;
+
+			$b4map{$b1}{$b2}{$b3}{$b4} = $out;
 		}
 		else
 		{
-			$last_comment = $i->{comment};
+			die sprintf("up to 4 byte code is supported: %x", $in);
 		}
 	}
-	print($out "\t/* $last_comment */") if ($verbose);
-	print $out "\n};\n";
-	close($out);
+
+	my @segments;
+
+	###
+	### Build a linear list of "segments", from the nested hashes.
+	###
+	### Each segment is a lookup table, keyed by the next byte in the input.
+	### The segments are written out physically to one big array in the final
+	### step, but logically, they form a radix tree. Or rather, four radix
+	### trees: one for 1-byte inputs, another for 2-byte inputs, 3-byte
+	### inputs, and 4-byte inputs.
+	###
+	### Each segment is represented by a hash with following fields:
+	###
+	### comment => <string to output as a comment>
+	### label => <label that can be used to refer to this segment from elsewhere>
+	### values => <a hash, keyed by byte, 0-0xff>
+	###
+	### Entries in 'values' can be integers (for leaf-level segments), or
+	### string labels, pointing to a segment with that label. Any missing
+	### values are treated as zeros. If 'values' hash is missing altogether,
+	### it's treated as all-zeros.
+	###
+	### Subsequent steps will enrich the segments with more fields.
+	###
+
+	# Add the segments for the radix trees themselves.
+	push @segments, build_segments_from_tree("Single byte table", "1-byte", 1, \%b1map);
+	push @segments, build_segments_from_tree("Two byte table", "2-byte", 2, \%b2map);
+	push @segments, build_segments_from_tree("Three byte table", "3-byte", 3, \%b3map);
+	push @segments, build_segments_from_tree("Four byte table", "4-byte", 4, \%b4map);
+
+	###
+	### Find min and max index used in each level of each tree.
+	###
+	### These are stored separately, and we can then leave out the unused
+	### parts of every segment. (When using the resulting tree, you must
+	### check each input byte against the min and max.)
+	###
+	my %min_idx;
+	my %max_idx;
+	foreach my $seg (@segments)
+	{
+		my $this_min = $min_idx{$seg->{depth}}->{$seg->{level}};
+		my $this_max = $max_idx{$seg->{depth}}->{$seg->{level}};
+
+		foreach my $i (keys %{$seg->{values}})
+		{
+			$this_min = $i if (!defined $this_min || $i < $this_min);
+			$this_max = $i if (!defined $this_max || $i > $this_max);
+		}
+
+		$min_idx{$seg->{depth}}{$seg->{level}} = $this_min;
+		$max_idx{$seg->{depth}}{$seg->{level}} = $this_max;
+	}
+	# Copy the mins and max's back to every segment, for convenience.
+	foreach my $seg (@segments)
+	{
+		$seg->{min_idx} = $min_idx{$seg->{depth}}{$seg->{level}};
+		$seg->{max_idx} = $max_idx{$seg->{depth}}{$seg->{level}};
+	}
+
+	###
+	### Prepend a dummy all-zeros map to the beginning.
+	###
+	### A 0 is an invalid value anywhere in the table, and this allows us to
+	### point to 0 offset from any table, to get a 0 result.
+	###
+
+	# Find the max range between min and max indexes in any of the segments.
+	my $widest_range = 0;
+	foreach my $seg (@segments)
+	{
+		my $this_range = $seg->{max_idx} - $seg->{min_idx};
+		$widest_range = $this_range if ($this_range > $widest_range);
+	}
+
+	unshift @segments, {
+		header => "Dummy map, for invalid values",
+		min_idx => 0,
+		max_idx => $widest_range
+	};
+
+	###
+	### Eliminate overlapping zeros
+	###
+	### For each segment, if there are zero values at the end of, and there
+	### are also zero values at the beginning of the next segment, we can
+	### overlay the tail of this segment with the head of next segment, to
+	### save space.
+	###
+	### To achieve that, we subtract the 'max_idx' of each segment with the
+	### amount of zeros that can be overlaid.
+	###
+	for (my $j = 0; $j < $#segments - 1; $j++)
+	{
+		my $seg = $segments[$j];
+		my $nextseg = $segments[$j + 1];
+
+		# Count the number of zero values at the end of this segment.
+		my $this_trail_zeros = 0;
+		for (my $i = $seg->{max_idx}; $i >= $seg->{min_idx} && !$seg->{values}->{$i}; $i--)
+		{
+			$this_trail_zeros++;
+		}
+
+		# Count the number of zeros at the beginning of next segment.
+		my $next_lead_zeros = 0;
+		for (my $i = $nextseg->{min_idx}; $i <= $nextseg->{max_idx} && !$nextseg->{values}->{$i}; $i++)
+		{
+			$next_lead_zeros++;
+		}
+
+		# How many zeros in common?
+		my $overlaid_trail_zeros =
+			($this_trail_zeros > $next_lead_zeros) ? $next_lead_zeros : $this_trail_zeros;
+
+		$seg->{overlaid_trail_zeros} = $overlaid_trail_zeros;
+		$seg->{max_idx} = $seg->{max_idx} - $overlaid_trail_zeros;
+	}
+
+	###
+	### Replace label references with real offsets.
+	###
+	### So far, the non-leaf segments have referred to other segments by
+	### their labels. Replace them with numerical offsets from the beginning
+	### of the final array. You cannot move, add, or remove segments after
+	### this step, as that would invalidate the offsets calculated here!
+	###
+	my $flatoff = 0;
+	my %segmap;
+
+	# First pass: assign offsets to each segment, and build hash
+	# of label => offset.
+	foreach my $seg (@segments)
+	{
+		$seg->{offset} = $flatoff;
+		$segmap{$seg->{label}} = $flatoff;
+		$flatoff += $seg->{max_idx} - $seg->{min_idx} + 1;
+	}
+	my $tblsize = $flatoff;
+
+	# Second pass: look up the offset of each label reference in the hash.
+	foreach my $seg (@segments)
+	{
+		while (my ($i, $val) = each %{$seg->{values}})
+		{
+			if (!($val =~ /^[0-9,.E]+$/ ))
+			{
+				my $segoff = $segmap{$val};
+				if ($segoff)
+				{
+					$seg->{values}->{$i} = $segoff;
+				}
+				else
+				{
+					die "no segment with label $val";
+				}
+			}
+		}
+	}
+
+	# Also look up the positions of the roots in the table.
+	my $b1root = $segmap{"1-byte"};
+	my $b2root = $segmap{"2-byte"};
+	my $b3root = $segmap{"3-byte"};
+	my $b4root = $segmap{"4-byte"};
+
+	# And the lower-upper values of each level in each radix tree.
+	my $b1_lower = $min_idx{1}{1};
+	my $b1_upper = $max_idx{1}{1};
+
+	my $b2_1_lower = $min_idx{2}{1};
+	my $b2_1_upper = $max_idx{2}{1};
+	my $b2_2_lower = $min_idx{2}{2};
+	my $b2_2_upper = $max_idx{2}{2};
+
+	my $b3_1_lower = $min_idx{3}{1};
+	my $b3_1_upper = $max_idx{3}{1};
+	my $b3_2_lower = $min_idx{3}{2};
+	my $b3_2_upper = $max_idx{3}{2};
+	my $b3_3_lower = $min_idx{3}{3};
+	my $b3_3_upper = $max_idx{3}{3};
+
+	my $b4_1_lower = $min_idx{4}{1};
+	my $b4_1_upper = $max_idx{4}{1};
+	my $b4_2_lower = $min_idx{4}{2};
+	my $b4_2_upper = $max_idx{4}{2};
+	my $b4_3_lower = $min_idx{4}{3};
+	my $b4_3_upper = $max_idx{4}{3};
+	my $b4_4_lower = $min_idx{4}{4};
+	my $b4_4_upper = $max_idx{4}{4};
+
+	###
+	### Find the maximum value in the whole table, to determine if we can
+	### use uint16 or if we need to use uint32.
+	###
+	my $max_val = 0;
+	foreach my $seg (@segments)
+	{
+		foreach my $val (values %{$seg->{values}})
+		{
+			$max_val = $val if ($val > $max_val);
+		}
+	}
+
+	my $datatype = ($max_val <= 0xffff) ? "uint16" : "uint32";
+
+	# For formatting, determine how many values we can fit on a single
+	# line, and how wide each value needs to be to align nicely.
+	my $vals_per_line;
+	my $colwidth;
+
+	if ($max_val <= 0xffff)
+	{
+		$vals_per_line = 8;
+		$colwidth = 4;
+	}
+	elsif ($max_val <= 0xffffff)
+	{
+		$vals_per_line = 4;
+		$colwidth = 6;
+	}
+	else
+	{
+		$vals_per_line = 4;
+		$colwidth = 8;
+	}
+
+	###
+	### Print the struct and array.
+	###
+	printf $out "static const $datatype ${tblname}_table[];\n";
+	printf $out "\n";
+	printf $out "static const pg_mb_radix_tree $tblname =\n";
+	printf $out "{\n";
+	if ($datatype eq "uint16")
+	{
+		print $out "  ${tblname}_table,\n";
+		print $out "  NULL, /* 32-bit table not used */\n";
+	}
+	if ($datatype eq "uint32")
+	{
+		print $out "  NULL, /* 16-bit table not used */\n";
+		print $out "  ${tblname}_table,\n";
+	}
+	printf $out "\n";
+	printf $out "  0x%04x, /* offset of table for 1-byte inputs */\n", $b1root;
+	printf $out "  0x%02x, /* b1_lower */\n", $b1_lower;
+	printf $out "  0x%02x, /* b1_upper */\n", $b1_upper;
+	printf $out "\n";
+	printf $out "  0x%04x, /* offset of table for 2-byte inputs */\n", $b2root;
+	printf $out "  0x%02x, /* b2_1_lower */\n", $b2_1_lower;
+	printf $out "  0x%02x, /* b2_1_upper */\n", $b2_1_upper;
+	printf $out "  0x%02x, /* b2_2_lower */\n", $b2_2_lower;
+	printf $out "  0x%02x, /* b2_2_upper */\n", $b2_2_upper;
+	printf $out "\n";
+	printf $out "  0x%04x, /* offset of table for 3-byte inputs */\n", $b3root;
+	printf $out "  0x%02x, /* b3_1_lower */\n", $b3_1_lower;
+	printf $out "  0x%02x, /* b3_1_upper */\n", $b3_1_upper;
+	printf $out "  0x%02x, /* b3_2_lower */\n", $b3_2_lower;
+	printf $out "  0x%02x, /* b3_2_upper */\n", $b3_2_upper;
+	printf $out "  0x%02x, /* b3_3_lower */\n", $b3_3_lower;
+	printf $out "  0x%02x, /* b3_3_upper */\n", $b3_3_upper;
+	printf $out "\n";
+	printf $out "  0x%04x, /* offset of table for 3-byte inputs */\n", $b4root;
+	printf $out "  0x%02x, /* b4_1_lower */\n", $b4_1_lower;
+	printf $out "  0x%02x, /* b4_1_upper */\n", $b4_1_upper;
+	printf $out "  0x%02x, /* b4_2_lower */\n", $b4_2_lower;
+	printf $out "  0x%02x, /* b4_2_upper */\n", $b4_2_upper;
+	printf $out "  0x%02x, /* b4_3_lower */\n", $b4_3_lower;
+	printf $out "  0x%02x, /* b4_3_upper */\n", $b4_3_upper;
+	printf $out "  0x%02x, /* b4_4_lower */\n", $b4_4_lower;
+	printf $out "  0x%02x  /* b4_4_upper */\n", $b4_4_upper;
+	print $out "};\n";
+	print $out "\n";
+	print $out "static const $datatype ${tblname}_table[$tblsize] =\n";
+	print $out "{";
+	my $off = 0;
+	foreach my $seg (@segments)
+	{
+		printf $out "\n";
+		printf $out "  /*** %s - offset 0x%05x ***/\n", $seg->{header}, $off;
+		printf $out "\n";
+
+		for (my $i=$seg->{min_idx}; $i <= $seg->{max_idx};)
+		{
+			# Print the next line's worth of values.
+			# XXX pad to begin at a nice boundary
+			printf $out "  /* %02x */ ", $i;
+			for (my $j = 0; $j < $vals_per_line && $i <= $seg->{max_idx}; $j++)
+			{
+				my $val = $seg->{values}->{$i};
+
+				printf $out " 0x%0*x", $colwidth, $val;
+				$off++;
+				if ($off != $tblsize)
+				{
+					print $out ",";
+				}
+				$i++;
+			}
+			print $out "\n";
+		}
+		if ($seg->{overlaid_trail_zeros})
+		{
+			printf $out "    /* $seg->{overlaid_trail_zeros} trailing zero values shared with next segment */\n";
+		}
+	}
+
+	# Sanity check.
+	if ($off != $tblsize) { die "table size didn't match!"; }
+
+	print $out "};\n";
 }
 
-sub print_to_utf8_combined_map
+###
+sub build_segments_from_tree
 {
-	my ($charset, $table, $verbose) = @_;
+	my ($header, $rootlabel, $depth, $map) = @_;
 
-	my $last_comment = "";
+	my @segments;
 
-	my $fname = lc("${charset}_to_utf8_combined.map");
+	if (%{$map})
+	{
+		@segments = build_segments_recurse($header, $rootlabel, "", 1, $depth, $map);
 
-	print "- Writing ${charset}=>UTF8 conversion table: $fname\n";
-	open(my $out, '>', $fname) || die "cannot open output file : $fname\n";
-	printf($out "/* src/backend/utils/mb/Unicode/${fname} */\n\n".
-		   "static const pg_local_to_utf_combined LUmap${charset}_combined[ %d ] = {",
-		   scalar(@$table));
-	my $first = 1;
-	foreach my $i (sort {$a->{code} <=> $b->{code}} @$table)
-    {
-		print($out ",") if (!$first);
-		$first = 0;
-		print($out "\t/* $last_comment */") if ($verbose);
+		# Sort the segments into "breadth-first" order. Not strictly required,
+		# but makes the maps nicer to read.
+		@segments = sort { $a->{level} cmp $b->{level} or
+						   $a->{path}  cmp $b->{path}}
+						 @segments;
+	}
+
+	return @segments;
+}
 
-		printf($out "\n  {0x%04x, 0x%08x, 0x%08x}", $i->{code}, $i->{utf8}, $i->{utf8_second});
-		$last_comment = "$i->{comment}";
+###
+sub build_segments_recurse
+{
+	my ($header, $label, $path, $level, $depth, $map) = @_;
+
+	my @segments;
+
+	if ($level == $depth)
+	{
+		push @segments, {
+			header => $header . ", leaf: ${path}xx",
+			label => $label,
+			level => $level,
+			depth => $depth,
+			path => $path,
+			values => $map
+		};
 	}
-	print($out "\t/* $last_comment */") if ($verbose);
-	print $out "\n};\n";
-	close($out);
+	else
+	{
+		my %children;
+
+		while (my ($i, $val) = each %$map)
+		{
+			my $childpath = $path . sprintf("%02x", $i);
+			my $childlabel = "$depth-level-$level-$childpath";
+
+			push @segments, build_segments_recurse($header, $childlabel, $childpath,
+												   $level + 1, $depth, $val);
+			$children{$i} = $childlabel;
+		}
+
+		push @segments, {
+			header => $header . ", byte #$level: ${path}xx",
+			label => $label,
+			level => $level,
+			depth => $depth,
+			path => $path,
+			values => \%children
+		};
+	}
+	return @segments;
+}
+
+#######################################################################
+# make_charmap - convert charset table to charmap hash
+#
+# make_charmap(\@charset, $direction)
+# charset     - ref to charset table : see print_conversion_tables
+# direction   - conversion direction
+#
+sub make_charmap
+{
+	my ($out, $charset, $direction, $verbose) = @_;
+
+	die "unacceptable direction : $direction"
+	  if ($direction != TO_UNICODE && $direction != FROM_UNICODE);
+
+	# In verbose mode, print a large comment with the source and comment of
+	# each character
+	if ($verbose)
+	{
+		print $out "/*\n";
+		print $out "<src>  <dst>    <file>:<lineno> <comment>\n";
+	}
+
+	my %charmap;
+	foreach my $c (@$charset)
+	{
+		# combined characters are handled elsewhere
+		next if (defined $c->{ucs_second});
+
+		next if ($c->{direction} != $direction && $c->{direction} != BOTH);
+
+		my ($src, $dst) =
+			$direction == TO_UNICODE
+			? ($c->{code}, ucs2utf($c->{ucs}))
+			: (ucs2utf($c->{ucs}), $c->{code});
+
+		# check for duplicate source codes
+		if (defined $charmap{$src})
+		{
+			printf STDERR
+				"Error: duplicate source code on %s:%d: 0x%04x => 0x%04x, 0x%04x\n",
+				$c->{f}, $c->{l}, $src, $charmap{$src}, $dst;
+			exit;
+		}
+		$charmap{$src} = $dst;
+
+		if ($verbose)
+		{
+			printf $out "0x%04x 0x%04x %s:%d %s\n", $src, $dst, $c->{f}, $c->{l}, $c->{comment};
+		}
+	}
+	if ($verbose)
+	{
+		print $out "*/\n\n";
+	}
+
+	return \%charmap;
+}
+
+#######################################################################
+# make_charmap_combined - convert charset table to charmap hash
+#     with checking duplicate source code
+#
+# make_charmap_combined(\@charset, $direction)
+# charset     - ref to charset table : see print_conversion_tables
+# direction   - conversion direction
+#
+sub make_charmap_combined
+{
+	my ($charset, $direction) = @_;
+
+	die "unacceptable direction : $direction"
+	  if ($direction != TO_UNICODE && $direction != FROM_UNICODE);
+
+	my @combined;
+	foreach my $c (@$charset)
+	{
+		next if ($c->{direction} != $direction && $c->{direction} != BOTH);
+
+		if (defined $c->{ucs_second})
+		{
+			my $entry = {utf8 => ucs2utf($c->{ucs}),
+						 utf8_second => ucs2utf($c->{ucs_second}),
+						 code => $c->{code},
+						 comment => $c->{comment},
+						 f => $c->{f}, l => $c->{l}};
+			push @combined, $entry;
+		}
+	}
+
+	return \@combined;
+}
+
+#######################################################################
+# convert UCS-4 to UTF-8
+#
+sub ucs2utf
+{
+	my ($ucs) = @_;
+	my $utf;
+
+	if ($ucs <= 0x007f)
+	{
+		$utf = $ucs;
+	}
+	elsif ($ucs > 0x007f && $ucs <= 0x07ff)
+	{
+		$utf = (($ucs & 0x003f) | 0x80) | ((($ucs >> 6) | 0xc0) << 8);
+	}
+	elsif ($ucs > 0x07ff && $ucs <= 0xffff)
+	{
+		$utf =
+		  ((($ucs >> 12) | 0xe0) << 16) |
+		  (((($ucs & 0x0fc0) >> 6) | 0x80) << 8) | (($ucs & 0x003f) | 0x80);
+	}
+	else
+	{
+		$utf =
+		  ((($ucs >> 18) | 0xf0) << 24) |
+		  (((($ucs & 0x3ffff) >> 12) | 0x80) << 16) |
+		  (((($ucs & 0x0fc0) >> 6) | 0x80) << 8) | (($ucs & 0x003f) | 0x80);
+	}
+	return ($utf);
 }
 
 1;
author	Heikki Linnakangas <heikki.linnakangas@iki.fi>	2017-03-13 20:46:39 +0200
committer	Heikki Linnakangas <heikki.linnakangas@iki.fi>	2017-03-13 20:46:39 +0200
commit	aeed17d00037950a16cc5ebad5b5592e5fa1ad0f (patch)
tree	070aac060c2f923b5c636afbab51272bd2d04056 /src/backend/utils/mb/Unicode/convutils.pm
parent	84892692fdedb753cfdd9a63b318b47ec640915f (diff)
download	postgresql-aeed17d00037950a16cc5ebad5b5592e5fa1ad0f.tar.gz postgresql-aeed17d00037950a16cc5ebad5b5592e5fa1ad0f.zip