#! /usr/bin/perl
#
# Copyright (c) 2001-2025, PostgreSQL Global Development Group
#
# src/backend/utils/mb/Unicode/UCS_to_EUC_TW.pl
#
# Generate UTF-8 <--> EUC_TW code conversion tables from
# map files provided by the Unicode organization.
# Unfortunately, the organization prohibits redistribution
# of the map files, so to use this script you must first
# obtain CNS11643.TXT from the organization's FTP site.
#
# CNS11643.TXT format:
#		 CNS11643 code in hex (3 bytes; this script treats
#		 the first byte as the plane number)
#		 UCS-2 code in hex
#		 # and Unicode name (not used in this script)

use strict;
use warnings FATAL => 'all';

use convutils;

my $this_script = 'src/backend/utils/mb/Unicode/UCS_to_EUC_TW.pl';

# Load the CNS 11643 <-> Unicode map.  The result is a reference to an
# array of hashes; this script uses the {ucs}, {code}, {rest}, {f} and {l}
# fields of each entry and rewrites {code} in place.
my $mapping = read_source("CNS11643.TXT");

# Entries that passed plane validation, in their original order.
my @converted;

# Additional decode-only entries, emitted after the regular ones.
my @extras;

foreach my $entry (@$mapping)
{
	my $origcode = $entry->{code};

	# Bits 16-20 of the CNS 11643 code are taken as the plane number.
	my $plane = ($origcode & 0x1f0000) >> 16;
	if ($plane > 16)
	{
		# Skip the entry for real: it is not added to @converted, so the
		# unconverted code never reaches the generated tables.
		print STDERR "Warning: invalid plane No.$plane. ignored\n";
		next;
	}

	# Set the high bit on both trailing bytes.
	my $code = ($origcode & 0xffff) | 0x8080;

	# Every plane except plane 1 uses the four-byte form: a 0x8e lead
	# byte, then 0xa0 + plane, then the two bytes computed above.
	if ($plane != 1)
	{
		$code |= 0x8ea00000 + ($plane << 16);
	}
	$entry->{code} = $code;
	push @converted, $entry;

	# Some codes are mapped twice in the EUC_TW to UTF-8 table: codes in
	# this range also get a four-byte 0x8ea1xxxx spelling, which is only
	# used in the EUC_TW -> Unicode direction.
	if ($origcode >= 0x12121 && $origcode <= 0x20000)
	{
		push @extras,
		  {
			ucs => $entry->{ucs},
			code => ($code + 0x8ea10000),
			rest => $entry->{rest},
			direction => TO_UNICODE,
			f => $entry->{f},
			l => $entry->{l}
		  };
	}
}

print_conversion_tables($this_script, "EUC_TW", [ @converted, @extras ]);