diff options
Diffstat (limited to 'src/common/unicode')
-rw-r--r-- | src/common/unicode/Makefile | 19 | ||||
-rw-r--r-- | src/common/unicode/category_test.c | 108 | ||||
-rw-r--r-- | src/common/unicode/generate-unicode_category_table.pl | 204 | ||||
-rw-r--r-- | src/common/unicode/generate-unicode_version.pl | 46 | ||||
-rw-r--r-- | src/common/unicode/meson.build | 40 | ||||
-rw-r--r-- | src/common/unicode/norm_test.c | 2 |
6 files changed, 416 insertions, 3 deletions
diff --git a/src/common/unicode/Makefile b/src/common/unicode/Makefile index 382da476cf9..27a7d5a807e 100644 --- a/src/common/unicode/Makefile +++ b/src/common/unicode/Makefile @@ -15,11 +15,15 @@ include $(top_builddir)/src/Makefile.global override CPPFLAGS := -DFRONTEND -I. $(CPPFLAGS) LIBS += $(PTHREAD_LIBS) +LDFLAGS_INTERNAL += $(ICU_LIBS) +CPPFLAGS += $(ICU_CFLAGS) + # By default, do nothing. all: -update-unicode: unicode_norm_table.h unicode_nonspacing_table.h unicode_east_asian_fw_table.h unicode_normprops_table.h unicode_norm_hashfunc.h +update-unicode: unicode_category_table.h unicode_norm_table.h unicode_nonspacing_table.h unicode_east_asian_fw_table.h unicode_normprops_table.h unicode_norm_hashfunc.h unicode_version.h mv $^ $(top_srcdir)/src/include/common/ + $(MAKE) category-check $(MAKE) normalization-check # These files are part of the Unicode Character Database. Download @@ -28,6 +32,12 @@ update-unicode: unicode_norm_table.h unicode_nonspacing_table.h unicode_east_asi UnicodeData.txt EastAsianWidth.txt DerivedNormalizationProps.txt CompositionExclusions.txt NormalizationTest.txt: $(top_builddir)/src/Makefile.global $(DOWNLOAD) https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/$(@F) +unicode_version.h: generate-unicode_version.pl + $(PERL) $< --version $(UNICODE_VERSION) + +unicode_category_table.h: generate-unicode_category_table.pl UnicodeData.txt + $(PERL) $< + # Generation of conversion tables used for string normalization with # UTF-8 strings. unicode_norm_hashfunc.h: unicode_norm_table.h @@ -45,9 +55,14 @@ unicode_normprops_table.h: generate-unicode_normprops_table.pl DerivedNormalizat $(PERL) $^ >$@ # Test suite +category-check: category_test + ./category_test + normalization-check: norm_test ./norm_test +category_test: category_test.o ../unicode_category.o | submake-common + norm_test: norm_test.o ../unicode_norm.o | submake-common norm_test.o: norm_test_table.h @@ -64,7 +79,7 @@ norm_test_table.h: generate-norm_test_table.pl NormalizationTest.txt clean: - rm -f $(OBJS) norm_test norm_test.o + rm -f $(OBJS) category_test category_test.o norm_test norm_test.o distclean: clean rm -f UnicodeData.txt EastAsianWidth.txt CompositionExclusions.txt NormalizationTest.txt norm_test_table.h unicode_norm_table.h diff --git a/src/common/unicode/category_test.c b/src/common/unicode/category_test.c new file mode 100644 index 00000000000..ba62716d456 --- /dev/null +++ b/src/common/unicode/category_test.c @@ -0,0 +1,108 @@ +/*------------------------------------------------------------------------- + * category_test.c + * Program to test Unicode general category functions. + * + * Portions Copyright (c) 2017-2023, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/common/unicode/category_test.c + * + *------------------------------------------------------------------------- + */ +#include "postgres_fe.h" + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#ifdef USE_ICU +#include <unicode/uchar.h> +#endif +#include "common/unicode_category.h" +#include "common/unicode_version.h" + +/* + * Parse version into integer for easy comparison. + */ +#ifdef USE_ICU +static int +parse_unicode_version(const char *version) +{ + int n, + major, + minor; + + n = sscanf(version, "%d.%d", &major, &minor); + + Assert(n == 2); + Assert(minor < 100); + + return major * 100 + minor; +} +#endif + +/* + * Exhaustively test that the Unicode category for each codepoint matches that + * returned by ICU. + */ +int +main(int argc, char **argv) +{ +#ifdef USE_ICU + int pg_unicode_version = parse_unicode_version(PG_UNICODE_VERSION); + int icu_unicode_version = parse_unicode_version(U_UNICODE_VERSION); + int pg_skipped_codepoints = 0; + int icu_skipped_codepoints = 0; + + printf("Postgres Unicode Version:\t%s\n", PG_UNICODE_VERSION); + printf("ICU Unicode Version:\t\t%s\n", U_UNICODE_VERSION); + + for (UChar32 code = 0; code <= 0x10ffff; code++) + { + uint8_t pg_category = unicode_category(code); + uint8_t icu_category = u_charType(code); + + if (pg_category != icu_category) + { + /* + * A version mismatch means that some assigned codepoints in the + * newer version may be unassigned in the older version. That's + * OK, though the test will not cover those codepoints marked + * unassigned in the older version (that is, it will no longer be + * an exhaustive test). + */ + if (pg_category == PG_U_UNASSIGNED && + pg_unicode_version < icu_unicode_version) + pg_skipped_codepoints++; + else if (icu_category == PG_U_UNASSIGNED && + icu_unicode_version < pg_unicode_version) + icu_skipped_codepoints++; + else + { + printf("FAILURE for codepoint %06x\n", code); + printf("Postgres category: %02d %s %s\n", pg_category, + unicode_category_abbrev(pg_category), + unicode_category_string(pg_category)); + printf("ICU category: %02d %s %s\n", icu_category, + unicode_category_abbrev(icu_category), + unicode_category_string(icu_category)); + printf("\n"); + exit(1); + } + } + } + + if (pg_skipped_codepoints > 0) + printf("Skipped %d codepoints unassigned in Postgres due to Unicode version mismatch.\n", + pg_skipped_codepoints); + if (icu_skipped_codepoints > 0) + printf("Skipped %d codepoints unassigned in ICU due to Unicode version mismatch.\n", + icu_skipped_codepoints); + + printf("category_test: All tests successful!\n"); + exit(0); +#else + printf("ICU support required for test; skipping.\n"); + exit(0); +#endif +} diff --git a/src/common/unicode/generate-unicode_category_table.pl b/src/common/unicode/generate-unicode_category_table.pl new file mode 100644 index 00000000000..8f03425e0bf --- /dev/null +++ b/src/common/unicode/generate-unicode_category_table.pl @@ -0,0 +1,204 @@ +#!/usr/bin/perl +# +# Generate a code point category table and its lookup utilities, using +# Unicode data files as input. +# +# Input: UnicodeData.txt +# Output: unicode_category_table.h +# +# Copyright (c) 2000-2023, PostgreSQL Global Development Group + +use strict; +use warnings; +use Getopt::Long; + +use FindBin; +use lib "$FindBin::RealBin/../../tools/"; + +my $CATEGORY_UNASSIGNED = 'Cn'; + +my $output_path = '.'; + +GetOptions('outdir:s' => \$output_path); + +my $output_table_file = "$output_path/unicode_category_table.h"; + +my $FH; + +# Read entries from UnicodeData.txt into a list of codepoint ranges +# and their general category. +my @category_ranges = (); +my $range_start = undef; +my $range_end = undef; +my $range_category = undef; + +# If between a "<..., First>" entry and a "<..., Last>" entry, the gap in +# codepoints represents a range, and $gap_category is equal to the +# category for both (which must match). Otherwise, the gap represents +# unassigned code points. +my $gap_category = undef; + +open($FH, '<', "$output_path/UnicodeData.txt") + or die "Could not open $output_path/UnicodeData.txt: $!."; +while (my $line = <$FH>) +{ + my @elts = split(';', $line); + my $code = hex($elts[0]); + my $name = $elts[1]; + my $category = $elts[2]; + + die "codepoint out of range" if $code > 0x10FFFF; + die "unassigned codepoint in UnicodeData.txt" if $category eq $CATEGORY_UNASSIGNED; + + if (!defined($range_start)) { + my $code_str = sprintf "0x%06x", $code; + die if defined($range_end) || defined($range_category) || defined($gap_category); + die "unexpected first entry <..., Last>" if ($name =~ /Last>/); + die "expected 0x000000 for first entry, got $code_str" if $code != 0x000000; + + # initialize + $range_start = $code; + $range_end = $code; + $range_category = $category; + if ($name =~ /<.*, First>$/) { + $gap_category = $category; + } else { + $gap_category = $CATEGORY_UNASSIGNED; + } + next; + } + + # Gap in codepoints detected. If it's a different category than + # the current range, emit the current range and initialize a new + # range representing the gap. + if ($range_end + 1 != $code && $range_category ne $gap_category) { + push(@category_ranges, {start => $range_start, end => $range_end, category => $range_category}); + $range_start = $range_end + 1; + $range_end = $code - 1; + $range_category = $gap_category; + } + + # different category; new range + if ($range_category ne $category) { + push(@category_ranges, {start => $range_start, end => $range_end, category => $range_category}); + $range_start = $code; + $range_end = $code; + $range_category = $category; + } + + if ($name =~ /<.*, First>$/) { + die "<..., First> entry unexpectedly follows another <..., First> entry" + if $gap_category ne $CATEGORY_UNASSIGNED; + $gap_category = $category; + } + elsif ($name =~ /<.*, Last>$/) { + die "<..., First> and <..., Last> entries have mismatching general category" + if $gap_category ne $category; + $gap_category = $CATEGORY_UNASSIGNED; + } + else { + die "unexpected entry found between <..., First> and <..., Last>" + if $gap_category ne $CATEGORY_UNASSIGNED; + } + + $range_end = $code; +} +close $FH; + +die "<..., First> entry with no corresponding <..., Last> entry" + if $gap_category ne $CATEGORY_UNASSIGNED; + +# emit final range +push(@category_ranges, {start => $range_start, end => $range_end, category => $range_category}); + +# emit range for any unassigned code points after last entry +if ($range_end < 0x10FFFF) { + $range_start = $range_end + 1; + $range_end = 0x10FFFF; + $range_category = $CATEGORY_UNASSIGNED; + push(@category_ranges, {start => $range_start, end => $range_end, category => $range_category}); +} + +my $num_ranges = scalar @category_ranges; + +# See: https://www.unicode.org/reports/tr44/#General_Category_Values +my $categories = { + Cn => 'PG_U_UNASSIGNED', + Lu => 'PG_U_UPPERCASE_LETTER', + Ll => 'PG_U_LOWERCASE_LETTER', + Lt => 'PG_U_TITLECASE_LETTER', + Lm => 'PG_U_MODIFIER_LETTER', + Lo => 'PG_U_OTHER_LETTER', + Mn => 'PG_U_NONSPACING_MARK', + Me => 'PG_U_ENCLOSING_MARK', + Mc => 'PG_U_SPACING_MARK', + Nd => 'PG_U_DECIMAL_NUMBER', + Nl => 'PG_U_LETTER_NUMBER', + No => 'PG_U_OTHER_NUMBER', + Zs => 'PG_U_SPACE_SEPARATOR', + Zl => 'PG_U_LINE_SEPARATOR', + Zp => 'PG_U_PARAGRAPH_SEPARATOR', + Cc => 'PG_U_CONTROL', + Cf => 'PG_U_FORMAT', + Co => 'PG_U_PRIVATE_USE', + Cs => 'PG_U_SURROGATE', + Pd => 'PG_U_DASH_PUNCTUATION', + Ps => 'PG_U_OPEN_PUNCTUATION', + Pe => 'PG_U_CLOSE_PUNCTUATION', + Pc => 'PG_U_CONNECTOR_PUNCTUATION', + Po => 'PG_U_OTHER_PUNCTUATION', + Sm => 'PG_U_MATH_SYMBOL', + Sc => 'PG_U_CURRENCY_SYMBOL', + Sk => 'PG_U_MODIFIER_SYMBOL', + So => 'PG_U_OTHER_SYMBOL', + Pi => 'PG_U_INITIAL_PUNCTUATION', + Pf => 'PG_U_FINAL_PUNCTUATION' +}; + +# Start writing out the output files +open my $OT, '>', $output_table_file + or die "Could not open output file $output_table_file: $!\n"; + +print $OT <<HEADER; +/*------------------------------------------------------------------------- + * + * unicode_category_table.h + * Category table for Unicode character classification. + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/common/unicode_category_table.h + * + *------------------------------------------------------------------------- + */ + +#include "common/unicode_category.h" + +/* + * File auto-generated by src/common/unicode/generate-unicode_category_table.pl, + * do not edit. There is deliberately not an #ifndef PG_UNICODE_CATEGORY_TABLE_H + * here. + */ +typedef struct +{ + uint32 first; /* Unicode codepoint */ + uint32 last; /* Unicode codepoint */ + uint8 category; /* General Category */ +} pg_category_range; + +/* table of Unicode codepoint ranges and their categories */ +static const pg_category_range unicode_categories[$num_ranges] = +{ +HEADER + +my $firsttime = 1; +foreach my $range (@category_ranges) { + printf $OT ",\n" unless $firsttime; + $firsttime = 0; + + my $category = $categories->{$range->{category}}; + die "category missing: $range->{category}" unless $category; + printf $OT "\t{0x%06x, 0x%06x, %s}", $range->{start}, $range->{end}, $category; +} +print $OT "\n};\n"; diff --git a/src/common/unicode/generate-unicode_version.pl b/src/common/unicode/generate-unicode_version.pl new file mode 100644 index 00000000000..22eb2f1a3dd --- /dev/null +++ b/src/common/unicode/generate-unicode_version.pl @@ -0,0 +1,46 @@ +#!/usr/bin/perl +# +# Generate header file with Unicode version used by Postgres. +# +# Output: unicode_version.h +# +# Copyright (c) 2000-2023, PostgreSQL Global Development Group + +use strict; +use warnings; +use Getopt::Long; + +use FindBin; +use lib "$FindBin::RealBin/../../tools/"; + +my $output_path = '.'; +my $version_str = undef; + +GetOptions('outdir:s' => \$output_path, 'version:s' => \$version_str); + +my @version_parts = split /\./, $version_str; + +my $unicode_version_str = sprintf "%d.%d", $version_parts[0], $version_parts[1]; + +my $output_file = "$output_path/unicode_version.h"; + +# Start writing out the output files +open my $OT, '>', $output_file + or die "Could not open output file $output_file: $!\n"; + +print $OT <<HEADER; +/*------------------------------------------------------------------------- + * + * unicode_version.h + * Unicode version used by Postgres. + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/common/unicode_version.h + * + *------------------------------------------------------------------------- + */ + +#define PG_UNICODE_VERSION "$unicode_version_str" +HEADER diff --git a/src/common/unicode/meson.build b/src/common/unicode/meson.build index 357ca2f9fb3..6af46122c4e 100644 --- a/src/common/unicode/meson.build +++ b/src/common/unicode/meson.build @@ -25,6 +25,25 @@ endforeach update_unicode_targets = [] update_unicode_targets += \ + custom_target('unicode_version.h', + output: ['unicode_version.h'], + command: [ + perl, files('generate-unicode_version.pl'), + '--outdir', '@OUTDIR@', '--version', UNICODE_VERSION], + build_by_default: false, + ) + +update_unicode_targets += \ + custom_target('unicode_category_table.h', + input: [unicode_data['UnicodeData.txt']], + output: ['unicode_category_table.h'], + command: [ + perl, files('generate-unicode_category_table.pl'), + '--outdir', '@OUTDIR@', '@INPUT@'], + build_by_default: false, + ) + +update_unicode_targets += \ custom_target('unicode_norm_table.h', input: [unicode_data['UnicodeData.txt'], unicode_data['CompositionExclusions.txt']], output: ['unicode_norm_table.h', 'unicode_norm_hashfunc.h'], @@ -73,6 +92,17 @@ norm_test_table = custom_target('norm_test_table.h', inc = include_directories('.') +category_test = executable('category_test', + ['category_test.c'], + dependencies: [frontend_port_code, icu], + include_directories: inc, + link_with: [common_static, pgport_static], + build_by_default: false, + kwargs: default_bin_args + { + 'install': false, + } +) + norm_test = executable('norm_test', ['norm_test.c', norm_test_table], dependencies: [frontend_port_code], @@ -87,6 +117,16 @@ norm_test = executable('norm_test', update_unicode_dep = [] if not meson.is_cross_build() + update_unicode_dep += custom_target('category_test.run', + output: 'category_test.run', + input: update_unicode_targets, + command: [category_test, UNICODE_VERSION], + build_by_default: false, + build_always_stale: true, + ) +endif + +if not meson.is_cross_build() update_unicode_dep += custom_target('norm_test.run', output: 'norm_test.run', input: update_unicode_targets, diff --git a/src/common/unicode/norm_test.c b/src/common/unicode/norm_test.c index 809a6dee545..b6097b912a6 100644 --- a/src/common/unicode/norm_test.c +++ b/src/common/unicode/norm_test.c @@ -81,6 +81,6 @@ main(int argc, char **argv) } } - printf("All tests successful!\n"); + printf("norm_test: All tests successful!\n"); exit(0); } |