aboutsummaryrefslogtreecommitdiff
path: root/src/common/unicode
diff options
context:
space:
mode:
Diffstat (limited to 'src/common/unicode')
-rw-r--r--src/common/unicode/Makefile19
-rw-r--r--src/common/unicode/category_test.c108
-rw-r--r--src/common/unicode/generate-unicode_category_table.pl204
-rw-r--r--src/common/unicode/generate-unicode_version.pl46
-rw-r--r--src/common/unicode/meson.build40
-rw-r--r--src/common/unicode/norm_test.c2
6 files changed, 416 insertions, 3 deletions
diff --git a/src/common/unicode/Makefile b/src/common/unicode/Makefile
index 382da476cf9..27a7d5a807e 100644
--- a/src/common/unicode/Makefile
+++ b/src/common/unicode/Makefile
@@ -15,11 +15,15 @@ include $(top_builddir)/src/Makefile.global
override CPPFLAGS := -DFRONTEND -I. $(CPPFLAGS)
LIBS += $(PTHREAD_LIBS)
+LDFLAGS_INTERNAL += $(ICU_LIBS)
+CPPFLAGS += $(ICU_CFLAGS)
+
# By default, do nothing.
all:
-update-unicode: unicode_norm_table.h unicode_nonspacing_table.h unicode_east_asian_fw_table.h unicode_normprops_table.h unicode_norm_hashfunc.h
+update-unicode: unicode_category_table.h unicode_norm_table.h unicode_nonspacing_table.h unicode_east_asian_fw_table.h unicode_normprops_table.h unicode_norm_hashfunc.h unicode_version.h
mv $^ $(top_srcdir)/src/include/common/
+ $(MAKE) category-check
$(MAKE) normalization-check
# These files are part of the Unicode Character Database. Download
@@ -28,6 +32,12 @@ update-unicode: unicode_norm_table.h unicode_nonspacing_table.h unicode_east_asi
UnicodeData.txt EastAsianWidth.txt DerivedNormalizationProps.txt CompositionExclusions.txt NormalizationTest.txt: $(top_builddir)/src/Makefile.global
$(DOWNLOAD) https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/$(@F)
+unicode_version.h: generate-unicode_version.pl
+ $(PERL) $< --version $(UNICODE_VERSION)
+
+unicode_category_table.h: generate-unicode_category_table.pl UnicodeData.txt
+ $(PERL) $<
+
# Generation of conversion tables used for string normalization with
# UTF-8 strings.
unicode_norm_hashfunc.h: unicode_norm_table.h
@@ -45,9 +55,14 @@ unicode_normprops_table.h: generate-unicode_normprops_table.pl DerivedNormalizat
$(PERL) $^ >$@
# Test suite
+category-check: category_test
+ ./category_test
+
normalization-check: norm_test
./norm_test
+category_test: category_test.o ../unicode_category.o | submake-common
+
norm_test: norm_test.o ../unicode_norm.o | submake-common
norm_test.o: norm_test_table.h
@@ -64,7 +79,7 @@ norm_test_table.h: generate-norm_test_table.pl NormalizationTest.txt
clean:
- rm -f $(OBJS) norm_test norm_test.o
+ rm -f $(OBJS) category_test category_test.o norm_test norm_test.o
distclean: clean
rm -f UnicodeData.txt EastAsianWidth.txt CompositionExclusions.txt NormalizationTest.txt norm_test_table.h unicode_norm_table.h
diff --git a/src/common/unicode/category_test.c b/src/common/unicode/category_test.c
new file mode 100644
index 00000000000..ba62716d456
--- /dev/null
+++ b/src/common/unicode/category_test.c
@@ -0,0 +1,108 @@
+/*-------------------------------------------------------------------------
+ * category_test.c
+ * Program to test Unicode general category functions.
+ *
+ * Portions Copyright (c) 2017-2023, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/common/unicode/category_test.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres_fe.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#ifdef USE_ICU
+#include <unicode/uchar.h>
+#endif
+#include "common/unicode_category.h"
+#include "common/unicode_version.h"
+
+/*
+ * parse_unicode_version
+ *
+ * Convert a Unicode version string of the form "MAJOR.MINOR" (anything after
+ * the minor number is ignored by the sscanf() format) into the integer
+ * MAJOR * 100 + MINOR, so that two versions can be compared numerically.
+ *
+ * The string is validated at run time rather than with Assert(). Assert() is
+ * compiled out of builds without assertion checking, which would leave the
+ * sscanf() result unused and, on a malformed string, return a value computed
+ * from uninitialized variables. On a malformed string this function prints a
+ * message to stderr and exits with status 1; it does not return.
+ */
+#ifdef USE_ICU
+static int
+parse_unicode_version(const char *version)
+{
+	int			major;
+	int			minor;
+
+	/* both fields must be present; minor must fit in the two low digits */
+	if (sscanf(version, "%d.%d", &major, &minor) != 2 ||
+		major < 0 || minor < 0 || minor >= 100)
+	{
+		fprintf(stderr,
+				"category_test: could not parse Unicode version \"%s\"\n",
+				version);
+		exit(1);
+	}
+
+	return major * 100 + minor;
+}
+#endif
+
+/*
+ * Exhaustively test that the Unicode category for each codepoint matches that
+ * returned by ICU.
+ */
+int
+main(int argc, char **argv)
+{
+#ifndef USE_ICU
+	/* Without ICU there is nothing to compare our table against. */
+	printf("ICU support required for test; skipping.\n");
+	exit(0);
+#else
+	int			pg_version = parse_unicode_version(PG_UNICODE_VERSION);
+	int			icu_version = parse_unicode_version(U_UNICODE_VERSION);
+	int			skipped_pg = 0;
+	int			skipped_icu = 0;
+	UChar32		cp;
+
+	printf("Postgres Unicode Version:\t%s\n", PG_UNICODE_VERSION);
+	printf("ICU Unicode Version:\t\t%s\n", U_UNICODE_VERSION);
+
+	/* Visit every codepoint from 0 through 0x10ffff inclusive. */
+	for (cp = 0; cp <= 0x10ffff; cp++)
+	{
+		uint8_t		ours = unicode_category(cp);
+		uint8_t		theirs = u_charType(cp);
+
+		if (ours == theirs)
+			continue;
+
+		/*
+		 * The two libraries may be built against different Unicode versions.
+		 * A codepoint that is unassigned on the older side but assigned on
+		 * the newer side is tolerated and merely counted; the test is then
+		 * no longer exhaustive for those codepoints.
+		 */
+		if (ours == PG_U_UNASSIGNED && pg_version < icu_version)
+		{
+			skipped_pg++;
+			continue;
+		}
+		if (theirs == PG_U_UNASSIGNED && icu_version < pg_version)
+		{
+			skipped_icu++;
+			continue;
+		}
+
+		/* Any other disagreement is a failure: report it and stop. */
+		printf("FAILURE for codepoint %06x\n", cp);
+		printf("Postgres category: %02d %s %s\n", ours,
+			   unicode_category_abbrev(ours),
+			   unicode_category_string(ours));
+		printf("ICU category: %02d %s %s\n", theirs,
+			   unicode_category_abbrev(theirs),
+			   unicode_category_string(theirs));
+		printf("\n");
+		exit(1);
+	}
+
+	if (skipped_pg > 0)
+		printf("Skipped %d codepoints unassigned in Postgres due to Unicode version mismatch.\n",
+			   skipped_pg);
+	if (skipped_icu > 0)
+		printf("Skipped %d codepoints unassigned in ICU due to Unicode version mismatch.\n",
+			   skipped_icu);
+
+	printf("category_test: All tests successful!\n");
+	exit(0);
+#endif
+}
diff --git a/src/common/unicode/generate-unicode_category_table.pl b/src/common/unicode/generate-unicode_category_table.pl
new file mode 100644
index 00000000000..8f03425e0bf
--- /dev/null
+++ b/src/common/unicode/generate-unicode_category_table.pl
@@ -0,0 +1,204 @@
+#!/usr/bin/perl
+#
+# Generate a code point category table and its lookup utilities, using
+# Unicode data files as input.
+#
+# Input: UnicodeData.txt
+# Output: unicode_category_table.h
+#
+# Copyright (c) 2000-2023, PostgreSQL Global Development Group
+
+use strict;
+use warnings;
+use Getopt::Long;
+
+use FindBin;
+use lib "$FindBin::RealBin/../../tools/";
+
+my $CATEGORY_UNASSIGNED = 'Cn';
+
+my $output_path = '.';
+
+GetOptions('outdir:s' => \$output_path);
+
+my $output_table_file = "$output_path/unicode_category_table.h";
+
+my $FH;
+
+# Read entries from UnicodeData.txt into a list of codepoint ranges
+# and their general category.
+my @category_ranges = ();
+my $range_start = undef;
+my $range_end = undef;
+my $range_category = undef;
+
+# If between a "<..., First>" entry and a "<..., Last>" entry, the gap in
+# codepoints represents a range, and $gap_category is equal to the
+# category for both (which must match). Otherwise, the gap represents
+# unassigned code points.
+my $gap_category = undef;
+
+open($FH, '<', "$output_path/UnicodeData.txt")
+ or die "Could not open $output_path/UnicodeData.txt: $!.";
+while (my $line = <$FH>)
+{
+ my @elts = split(';', $line); # fields used below: [0] hex codepoint, [1] name, [2] general category
+ my $code = hex($elts[0]);
+ my $name = $elts[1];
+ my $category = $elts[2];
+
+ die "codepoint out of range" if $code > 0x10FFFF;
+ die "unassigned codepoint in UnicodeData.txt" if $category eq $CATEGORY_UNASSIGNED;
+
+ if (!defined($range_start)) {
+ my $code_str = sprintf "0x%06x", $code;
+ die if defined($range_end) || defined($range_category) || defined($gap_category); # sanity check: no state may exist before the first entry
+ die "unexpected first entry <..., Last>" if ($name =~ /Last>/);
+ die "expected 0x000000 for first entry, got $code_str" if $code != 0x000000;
+
+ # initialize the current range and the gap state from the first entry
+ $range_start = $code;
+ $range_end = $code;
+ $range_category = $category;
+ if ($name =~ /<.*, First>$/) {
+ $gap_category = $category;
+ } else {
+ $gap_category = $CATEGORY_UNASSIGNED;
+ }
+ next; # the first entry is fully handled here
+ }
+
+ # Gap in codepoints detected. If the gap's category ($gap_category) differs
+ # from the current range's, emit the current range and start a new range
+ # covering the gap itself; otherwise the gap just extends the current range.
+ if ($range_end + 1 != $code && $range_category ne $gap_category) {
+ push(@category_ranges, {start => $range_start, end => $range_end, category => $range_category});
+ $range_start = $range_end + 1;
+ $range_end = $code - 1;
+ $range_category = $gap_category;
+ }
+
+ # this entry's category differs from the current range's: emit it, start anew
+ if ($range_category ne $category) {
+ push(@category_ranges, {start => $range_start, end => $range_end, category => $range_category});
+ $range_start = $code;
+ $range_end = $code;
+ $range_category = $category;
+ }
+
+ if ($name =~ /<.*, First>$/) {
+ die "<..., First> entry unexpectedly follows another <..., First> entry"
+ if $gap_category ne $CATEGORY_UNASSIGNED;
+ $gap_category = $category;
+ }
+ elsif ($name =~ /<.*, Last>$/) {
+ die "<..., First> and <..., Last> entries have mismatching general category"
+ if $gap_category ne $category;
+ $gap_category = $CATEGORY_UNASSIGNED;
+ }
+ else {
+ die "unexpected entry found between <..., First> and <..., Last>"
+ if $gap_category ne $CATEGORY_UNASSIGNED;
+ }
+
+ $range_end = $code; # the current range now ends at this entry's codepoint
+}
+close $FH;
+
+die "<..., First> entry with no corresponding <..., Last> entry"
+ if $gap_category ne $CATEGORY_UNASSIGNED;
+
+# emit final range
+push(@category_ranges, {start => $range_start, end => $range_end, category => $range_category});
+
+# emit range for any unassigned code points after last entry
+if ($range_end < 0x10FFFF) {
+ $range_start = $range_end + 1;
+ $range_end = 0x10FFFF;
+ $range_category = $CATEGORY_UNASSIGNED;
+ push(@category_ranges, {start => $range_start, end => $range_end, category => $range_category});
+}
+
+my $num_ranges = scalar @category_ranges;
+
+# See: https://www.unicode.org/reports/tr44/#General_Category_Values
+my $categories = {
+ Cn => 'PG_U_UNASSIGNED',
+ Lu => 'PG_U_UPPERCASE_LETTER',
+ Ll => 'PG_U_LOWERCASE_LETTER',
+ Lt => 'PG_U_TITLECASE_LETTER',
+ Lm => 'PG_U_MODIFIER_LETTER',
+ Lo => 'PG_U_OTHER_LETTER',
+ Mn => 'PG_U_NONSPACING_MARK',
+ Me => 'PG_U_ENCLOSING_MARK',
+ Mc => 'PG_U_SPACING_MARK',
+ Nd => 'PG_U_DECIMAL_NUMBER',
+ Nl => 'PG_U_LETTER_NUMBER',
+ No => 'PG_U_OTHER_NUMBER',
+ Zs => 'PG_U_SPACE_SEPARATOR',
+ Zl => 'PG_U_LINE_SEPARATOR',
+ Zp => 'PG_U_PARAGRAPH_SEPARATOR',
+ Cc => 'PG_U_CONTROL',
+ Cf => 'PG_U_FORMAT',
+ Co => 'PG_U_PRIVATE_USE',
+ Cs => 'PG_U_SURROGATE',
+ Pd => 'PG_U_DASH_PUNCTUATION',
+ Ps => 'PG_U_OPEN_PUNCTUATION',
+ Pe => 'PG_U_CLOSE_PUNCTUATION',
+ Pc => 'PG_U_CONNECTOR_PUNCTUATION',
+ Po => 'PG_U_OTHER_PUNCTUATION',
+ Sm => 'PG_U_MATH_SYMBOL',
+ Sc => 'PG_U_CURRENCY_SYMBOL',
+ Sk => 'PG_U_MODIFIER_SYMBOL',
+ So => 'PG_U_OTHER_SYMBOL',
+ Pi => 'PG_U_INITIAL_PUNCTUATION',
+ Pf => 'PG_U_FINAL_PUNCTUATION'
+};
+
+# Start writing out the output files
+open my $OT, '>', $output_table_file
+ or die "Could not open output file $output_table_file: $!\n";
+
+print $OT <<HEADER;
+/*-------------------------------------------------------------------------
+ *
+ * unicode_category_table.h
+ * Category table for Unicode character classification.
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/common/unicode_category_table.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "common/unicode_category.h"
+
+/*
+ * File auto-generated by src/common/unicode/generate-unicode_category_table.pl,
+ * do not edit. There is deliberately not an #ifndef PG_UNICODE_CATEGORY_TABLE_H
+ * here.
+ */
+typedef struct
+{
+ uint32 first; /* Unicode codepoint */
+ uint32 last; /* Unicode codepoint */
+ uint8 category; /* General Category */
+} pg_category_range;
+
+/* table of Unicode codepoint ranges and their categories */
+static const pg_category_range unicode_categories[$num_ranges] =
+{
+HEADER
+
+# Format one C initializer per codepoint range. Every category is resolved
+# before any row is written, so an unknown category aborts the script before
+# the table body is emitted.
+my @table_rows;
+foreach my $range (@category_ranges) {
+	my $category = $categories->{$range->{category}};
+	die "category missing: $range->{category}" unless $category;
+	push(@table_rows,
+		sprintf("\t{0x%06x, 0x%06x, %s}", $range->{start}, $range->{end}, $category));
+}
+
+# Rows are separated by ",\n"; the last row is followed by the closing brace.
+print $OT join(",\n", @table_rows), "\n};\n";
+
+# Output is buffered, so a write failure (for example a full disk) may only
+# be reported when the handle is closed. Check it, so that a truncated header
+# is never silently treated as a successful build product.
+close $OT
+  or die "Could not close output file $output_table_file: $!\n";
diff --git a/src/common/unicode/generate-unicode_version.pl b/src/common/unicode/generate-unicode_version.pl
new file mode 100644
index 00000000000..22eb2f1a3dd
--- /dev/null
+++ b/src/common/unicode/generate-unicode_version.pl
@@ -0,0 +1,46 @@
+#!/usr/bin/perl
+#
+# Generate header file with Unicode version used by Postgres.
+#
+# Output: unicode_version.h
+#
+# Copyright (c) 2000-2023, PostgreSQL Global Development Group
+
+use strict;
+use warnings;
+use Getopt::Long;
+
+use FindBin;
+use lib "$FindBin::RealBin/../../tools/";
+
+my $output_path = '.';
+my $version_str = undef;
+
+GetOptions('outdir:s' => \$output_path, 'version:s' => \$version_str);
+
+# --version is required. Only the leading MAJOR and MINOR numbers are used;
+# any further components (e.g. the ".0" update number in "15.0.0") are
+# ignored, and a missing minor number is treated as 0. Reject a missing or
+# non-numeric version instead of silently emitting a bogus "0.0".
+my ($version_major, $version_minor);
+if (defined($version_str) && $version_str =~ /^(\d+)(?:\.(\d+))?/)
+{
+	$version_major = $1;
+	$version_minor = defined($2) ? $2 : 0;
+}
+else
+{
+	die "usage: $0 --version MAJOR.MINOR[.UPDATE] [--outdir DIR]\n";
+}
+
+my $unicode_version_str = sprintf "%d.%d", $version_major, $version_minor;
+
+my $output_file = "$output_path/unicode_version.h";
+
+# Start writing out the output files
+open my $OT, '>', $output_file
+  or die "Could not open output file $output_file: $!\n";
+
+print $OT <<HEADER;
+/*-------------------------------------------------------------------------
+ *
+ * unicode_version.h
+ * Unicode version used by Postgres.
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/common/unicode_version.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#define PG_UNICODE_VERSION "$unicode_version_str"
+HEADER
+
+# Buffered write errors may only be reported at close time; check for them.
+close $OT
+  or die "Could not close output file $output_file: $!\n";
diff --git a/src/common/unicode/meson.build b/src/common/unicode/meson.build
index 357ca2f9fb3..6af46122c4e 100644
--- a/src/common/unicode/meson.build
+++ b/src/common/unicode/meson.build
@@ -25,6 +25,25 @@ endforeach
update_unicode_targets = []
update_unicode_targets += \
+ custom_target('unicode_version.h',
+ output: ['unicode_version.h'],
+ command: [
+ perl, files('generate-unicode_version.pl'),
+ '--outdir', '@OUTDIR@', '--version', UNICODE_VERSION],
+ build_by_default: false,
+ )
+
+update_unicode_targets += \
+ custom_target('unicode_category_table.h',
+ input: [unicode_data['UnicodeData.txt']],
+ output: ['unicode_category_table.h'],
+ command: [
+ perl, files('generate-unicode_category_table.pl'),
+ '--outdir', '@OUTDIR@', '@INPUT@'],
+ build_by_default: false,
+ )
+
+update_unicode_targets += \
custom_target('unicode_norm_table.h',
input: [unicode_data['UnicodeData.txt'], unicode_data['CompositionExclusions.txt']],
output: ['unicode_norm_table.h', 'unicode_norm_hashfunc.h'],
@@ -73,6 +92,17 @@ norm_test_table = custom_target('norm_test_table.h',
inc = include_directories('.')
+category_test = executable('category_test',
+ ['category_test.c'],
+ dependencies: [frontend_port_code, icu],
+ include_directories: inc,
+ link_with: [common_static, pgport_static],
+ build_by_default: false,
+ kwargs: default_bin_args + {
+ 'install': false,
+ }
+)
+
norm_test = executable('norm_test',
['norm_test.c', norm_test_table],
dependencies: [frontend_port_code],
@@ -87,6 +117,16 @@ norm_test = executable('norm_test',
update_unicode_dep = []
if not meson.is_cross_build()
+ update_unicode_dep += custom_target('category_test.run',
+ output: 'category_test.run',
+ input: update_unicode_targets,
+ command: [category_test, UNICODE_VERSION],
+ build_by_default: false,
+ build_always_stale: true,
+ )
+endif
+
+if not meson.is_cross_build()
update_unicode_dep += custom_target('norm_test.run',
output: 'norm_test.run',
input: update_unicode_targets,
diff --git a/src/common/unicode/norm_test.c b/src/common/unicode/norm_test.c
index 809a6dee545..b6097b912a6 100644
--- a/src/common/unicode/norm_test.c
+++ b/src/common/unicode/norm_test.c
@@ -81,6 +81,6 @@ main(int argc, char **argv)
}
}
- printf("All tests successful!\n");
+ printf("norm_test: All tests successful!\n");
exit(0);
}