diff options
Diffstat (limited to 'src/common/unicode/category_test.c')
-rw-r--r-- | src/common/unicode/category_test.c | 222 |
1 files changed, 179 insertions, 43 deletions
diff --git a/src/common/unicode/category_test.c b/src/common/unicode/category_test.c index f1aaac0f613..e823044d63a 100644 --- a/src/common/unicode/category_test.c +++ b/src/common/unicode/category_test.c @@ -1,6 +1,6 @@ /*------------------------------------------------------------------------- * category_test.c - * Program to test Unicode general category functions. + * Program to test Unicode general category and character properties. * * Portions Copyright (c) 2017-2024, PostgreSQL Global Development Group * @@ -14,17 +14,23 @@ #include <stdio.h> #include <stdlib.h> #include <string.h> +#include <wctype.h> #ifdef USE_ICU #include <unicode/uchar.h> #endif + #include "common/unicode_category.h" #include "common/unicode_version.h" +static int pg_unicode_version = 0; +#ifdef USE_ICU +static int icu_unicode_version = 0; +#endif + /* * Parse version into integer for easy comparison. */ -#ifdef USE_ICU static int parse_unicode_version(const char *version) { @@ -39,57 +45,175 @@ parse_unicode_version(const char *version) return major * 100 + minor; } -#endif +#ifdef USE_ICU /* - * Exhaustively test that the Unicode category for each codepoint matches that - * returned by ICU. + * Test Postgres Unicode tables by comparing with ICU. Test the General + * Category, as well as the properties Alphabetic, Lowercase, Uppercase, + * White_Space, and Hex_Digit. */ -int -main(int argc, char **argv) +static void +icu_test() { -#ifdef USE_ICU - int pg_unicode_version = parse_unicode_version(PG_UNICODE_VERSION); - int icu_unicode_version = parse_unicode_version(U_UNICODE_VERSION); + int successful = 0; int pg_skipped_codepoints = 0; int icu_skipped_codepoints = 0; - printf("category_test: Postgres Unicode version:\t%s\n", PG_UNICODE_VERSION); - printf("category_test: ICU Unicode version:\t\t%s\n", U_UNICODE_VERSION); - - for (UChar32 code = 0; code <= 0x10ffff; code++) + for (pg_wchar code = 0; code <= 0x10ffff; code++) { uint8_t pg_category = unicode_category(code); uint8_t icu_category = u_charType(code); + /* Property tests */ + bool prop_alphabetic = pg_u_prop_alphabetic(code); + bool prop_lowercase = pg_u_prop_lowercase(code); + bool prop_uppercase = pg_u_prop_uppercase(code); + bool prop_cased = pg_u_prop_cased(code); + bool prop_case_ignorable = pg_u_prop_case_ignorable(code); + bool prop_white_space = pg_u_prop_white_space(code); + bool prop_hex_digit = pg_u_prop_hex_digit(code); + bool prop_join_control = pg_u_prop_join_control(code); + + bool icu_prop_alphabetic = u_hasBinaryProperty( + code, UCHAR_ALPHABETIC); + bool icu_prop_lowercase = u_hasBinaryProperty( + code, UCHAR_LOWERCASE); + bool icu_prop_uppercase = u_hasBinaryProperty( + code, UCHAR_UPPERCASE); + bool icu_prop_cased = u_hasBinaryProperty( + code, UCHAR_CASED); + bool icu_prop_case_ignorable = u_hasBinaryProperty( + code, UCHAR_CASE_IGNORABLE); + bool icu_prop_white_space = u_hasBinaryProperty( + code, UCHAR_WHITE_SPACE); + bool icu_prop_hex_digit = u_hasBinaryProperty( + code, UCHAR_HEX_DIGIT); + bool icu_prop_join_control = u_hasBinaryProperty( + code, UCHAR_JOIN_CONTROL); + + /* + * Compare with ICU for character classes using: + * + * https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/uchar_8h.html#details + * + * which describes how to use ICU to test for membership in regex + * character classes. + * + * NB: the document suggests testing for some properties such as + * UCHAR_POSIX_ALNUM, but that doesn't mean that we're testing for the + * "POSIX Compatible" character classes. + */ + bool isalpha = pg_u_isalpha(code); + bool islower = pg_u_islower(code); + bool isupper = pg_u_isupper(code); + bool ispunct = pg_u_ispunct(code, false); + bool isdigit = pg_u_isdigit(code, false); + bool isxdigit = pg_u_isxdigit(code, false); + bool isalnum = pg_u_isalnum(code, false); + bool isspace = pg_u_isspace(code); + bool isblank = pg_u_isblank(code); + bool iscntrl = pg_u_iscntrl(code); + bool isgraph = pg_u_isgraph(code); + bool isprint = pg_u_isprint(code); + + bool icu_isalpha = u_isUAlphabetic(code); + bool icu_islower = u_isULowercase(code); + bool icu_isupper = u_isUUppercase(code); + bool icu_ispunct = u_ispunct(code); + bool icu_isdigit = u_isdigit(code); + bool icu_isxdigit = u_hasBinaryProperty(code, + UCHAR_POSIX_XDIGIT); + bool icu_isalnum = u_hasBinaryProperty(code, + UCHAR_POSIX_ALNUM); + bool icu_isspace = u_isUWhiteSpace(code); + bool icu_isblank = u_isblank(code); + bool icu_iscntrl = icu_category == PG_U_CONTROL; + bool icu_isgraph = u_hasBinaryProperty(code, + UCHAR_POSIX_GRAPH); + bool icu_isprint = u_hasBinaryProperty(code, + UCHAR_POSIX_PRINT); + + /* + * A version mismatch means that some assigned codepoints in the newer + * version may be unassigned in the older version. That's OK, though + * the test will not cover those codepoints marked unassigned in the + * older version (that is, it will no longer be an exhaustive test). + */ + if (pg_category == PG_U_UNASSIGNED && + icu_category != PG_U_UNASSIGNED && + pg_unicode_version < icu_unicode_version) + { + pg_skipped_codepoints++; + continue; + } + + if (icu_category == PG_U_UNASSIGNED && + pg_category != PG_U_UNASSIGNED && + icu_unicode_version < pg_unicode_version) + { + icu_skipped_codepoints++; + continue; + } + if (pg_category != icu_category) { - /* - * A version mismatch means that some assigned codepoints in the - * newer version may be unassigned in the older version. That's - * OK, though the test will not cover those codepoints marked - * unassigned in the older version (that is, it will no longer be - * an exhaustive test). - */ - if (pg_category == PG_U_UNASSIGNED && - pg_unicode_version < icu_unicode_version) - pg_skipped_codepoints++; - else if (icu_category == PG_U_UNASSIGNED && - icu_unicode_version < pg_unicode_version) - icu_skipped_codepoints++; - else - { - printf("category_test: FAILURE for codepoint 0x%06x\n", code); - printf("category_test: Postgres category: %02d %s %s\n", pg_category, - unicode_category_abbrev(pg_category), - unicode_category_string(pg_category)); - printf("category_test: ICU category: %02d %s %s\n", icu_category, - unicode_category_abbrev(icu_category), - unicode_category_string(icu_category)); - printf("\n"); - exit(1); - } + printf("category_test: FAILURE for codepoint 0x%06x\n", code); + printf("category_test: Postgres category: %02d %s %s\n", pg_category, + unicode_category_abbrev(pg_category), + unicode_category_string(pg_category)); + printf("category_test: ICU category: %02d %s %s\n", icu_category, + unicode_category_abbrev(icu_category), + unicode_category_string(icu_category)); + printf("\n"); + exit(1); + } + + if (prop_alphabetic != icu_prop_alphabetic || + prop_lowercase != icu_prop_lowercase || + prop_uppercase != icu_prop_uppercase || + prop_cased != icu_prop_cased || + prop_case_ignorable != icu_prop_case_ignorable || + prop_white_space != icu_prop_white_space || + prop_hex_digit != icu_prop_hex_digit || + prop_join_control != icu_prop_join_control) + { + printf("category_test: FAILURE for codepoint 0x%06x\n", code); + printf("category_test: Postgres property alphabetic/lowercase/uppercase/cased/case_ignorable/white_space/hex_digit/join_control: %d/%d/%d/%d/%d/%d/%d/%d\n", + prop_alphabetic, prop_lowercase, prop_uppercase, + prop_cased, prop_case_ignorable, + prop_white_space, prop_hex_digit, prop_join_control); + printf("category_test: ICU property alphabetic/lowercase/uppercase/cased/case_ignorable/white_space/hex_digit/join_control: %d/%d/%d/%d/%d/%d/%d/%d\n", + icu_prop_alphabetic, icu_prop_lowercase, icu_prop_uppercase, + icu_prop_cased, icu_prop_case_ignorable, + icu_prop_white_space, icu_prop_hex_digit, icu_prop_join_control); + printf("\n"); + exit(1); } + + if (isalpha != icu_isalpha || + islower != icu_islower || + isupper != icu_isupper || + ispunct != icu_ispunct || + isdigit != icu_isdigit || + isxdigit != icu_isxdigit || + isalnum != icu_isalnum || + isspace != icu_isspace || + isblank != icu_isblank || + iscntrl != icu_iscntrl || + isgraph != icu_isgraph || + isprint != icu_isprint) + { + printf("category_test: FAILURE for codepoint 0x%06x\n", code); + printf("category_test: Postgres class alpha/lower/upper/punct/digit/xdigit/alnum/space/blank/cntrl/graph/print: %d/%d/%d/%d/%d/%d/%d/%d/%d/%d/%d/%d\n", + isalpha, islower, isupper, ispunct, isdigit, isxdigit, isalnum, isspace, isblank, iscntrl, isgraph, isprint); + printf("category_test: ICU class alpha/lower/upper/punct/digit/xdigit/alnum/space/blank/cntrl/graph/print: %d/%d/%d/%d/%d/%d/%d/%d/%d/%d/%d/%d\n", + icu_isalpha, icu_islower, icu_isupper, icu_ispunct, icu_isdigit, icu_isxdigit, icu_isalnum, icu_isspace, icu_isblank, icu_iscntrl, icu_isgraph, icu_isprint); + printf("\n"); + exit(1); + } + + if (pg_category != PG_U_UNASSIGNED) + successful++; } if (pg_skipped_codepoints > 0) @@ -99,10 +223,22 @@ main(int argc, char **argv) printf("category_test: skipped %d codepoints unassigned in ICU due to Unicode version mismatch\n", icu_skipped_codepoints); - printf("category_test: success\n"); - exit(0); + printf("category_test: ICU test: %d codepoints successful\n", successful); +} +#endif + +int +main(int argc, char **argv) +{ + pg_unicode_version = parse_unicode_version(PG_UNICODE_VERSION); + printf("category_test: Postgres Unicode version:\t%s\n", PG_UNICODE_VERSION); + +#ifdef USE_ICU + icu_unicode_version = parse_unicode_version(U_UNICODE_VERSION); + printf("category_test: ICU Unicode version:\t\t%s\n", U_UNICODE_VERSION); + + icu_test(); #else - printf("category_test: ICU support required for test; skipping\n"); - exit(0); + printf("category_test: ICU not available; skipping\n"); #endif } |