diff options
Diffstat (limited to 'src/common/unicode_category.c')
-rw-r--r-- | src/common/unicode_category.c | 195 |
1 files changed, 195 insertions, 0 deletions
diff --git a/src/common/unicode_category.c b/src/common/unicode_category.c new file mode 100644 index 00000000000..cec9c0d998f --- /dev/null +++ b/src/common/unicode_category.c @@ -0,0 +1,195 @@ +/*------------------------------------------------------------------------- + * unicode_category.c + * Determine general category of Unicode characters. + * + * Portions Copyright (c) 2017-2023, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/common/unicode_category.c + * + *------------------------------------------------------------------------- + */ +#ifndef FRONTEND +#include "postgres.h" +#else +#include "postgres_fe.h" +#endif + +#include "common/unicode_category.h" +#include "common/unicode_category_table.h" + +/* + * Unicode general category for the given codepoint. + */ +pg_unicode_category +unicode_category(pg_wchar ucs) +{ + int min = 0; + int mid; + int max = lengthof(unicode_categories) - 1; + + Assert(ucs >= unicode_categories[0].first && + ucs <= unicode_categories[max].last); + + while (max >= min) + { + mid = (min + max) / 2; + if (ucs > unicode_categories[mid].last) + min = mid + 1; + else if (ucs < unicode_categories[mid].first) + max = mid - 1; + else + return unicode_categories[mid].category; + } + + Assert(false); + return (pg_unicode_category) - 1; +} + +/* + * Description of Unicode general category. + */ +const char * +unicode_category_string(pg_unicode_category category) +{ + switch (category) + { + case PG_U_UNASSIGNED: + return "Unassigned"; + case PG_U_UPPERCASE_LETTER: + return "Uppercase_Letter"; + case PG_U_LOWERCASE_LETTER: + return "Lowercase_Letter"; + case PG_U_TITLECASE_LETTER: + return "Titlecase_Letter"; + case PG_U_MODIFIER_LETTER: + return "Modifier_Letter"; + case PG_U_OTHER_LETTER: + return "Other_Letter"; + case PG_U_NONSPACING_MARK: + return "Nonspacing_Mark"; + case PG_U_ENCLOSING_MARK: + return "Enclosing_Mark"; + case PG_U_SPACING_MARK: + return "Spacing_Mark"; + case PG_U_DECIMAL_NUMBER: + return "Decimal_Number"; + case PG_U_LETTER_NUMBER: + return "Letter_Number"; + case PG_U_OTHER_NUMBER: + return "Other_Number"; + case PG_U_SPACE_SEPARATOR: + return "Space_Separator"; + case PG_U_LINE_SEPARATOR: + return "Line_Separator"; + case PG_U_PARAGRAPH_SEPARATOR: + return "Paragraph_Separator"; + case PG_U_CONTROL: + return "Control"; + case PG_U_FORMAT: + return "Format"; + case PG_U_PRIVATE_USE: + return "Private_Use"; + case PG_U_SURROGATE: + return "Surrogate"; + case PG_U_DASH_PUNCTUATION: + return "Dash_Punctuation"; + case PG_U_OPEN_PUNCTUATION: + return "Open_Punctuation"; + case PG_U_CLOSE_PUNCTUATION: + return "Close_Punctuation"; + case PG_U_CONNECTOR_PUNCTUATION: + return "Connector_Punctuation"; + case PG_U_OTHER_PUNCTUATION: + return "Other_Punctuation"; + case PG_U_MATH_SYMBOL: + return "Math_Symbol"; + case PG_U_CURRENCY_SYMBOL: + return "Currency_Symbol"; + case PG_U_MODIFIER_SYMBOL: + return "Modifier_Symbol"; + case PG_U_OTHER_SYMBOL: + return "Other_Symbol"; + case PG_U_INITIAL_PUNCTUATION: + return "Initial_Punctuation"; + case PG_U_FINAL_PUNCTUATION: + return "Final_Punctuation"; + } + + Assert(false); + return "Unrecognized"; /* keep compiler quiet */ +} + +/* + * Short code for Unicode general category. + */ +const char * +unicode_category_abbrev(pg_unicode_category category) +{ + switch (category) + { + case PG_U_UNASSIGNED: + return "Cn"; + case PG_U_UPPERCASE_LETTER: + return "Lu"; + case PG_U_LOWERCASE_LETTER: + return "Ll"; + case PG_U_TITLECASE_LETTER: + return "Lt"; + case PG_U_MODIFIER_LETTER: + return "Lm"; + case PG_U_OTHER_LETTER: + return "Lo"; + case PG_U_NONSPACING_MARK: + return "Mn"; + case PG_U_ENCLOSING_MARK: + return "Me"; + case PG_U_SPACING_MARK: + return "Mc"; + case PG_U_DECIMAL_NUMBER: + return "Nd"; + case PG_U_LETTER_NUMBER: + return "Nl"; + case PG_U_OTHER_NUMBER: + return "No"; + case PG_U_SPACE_SEPARATOR: + return "Zs"; + case PG_U_LINE_SEPARATOR: + return "Zl"; + case PG_U_PARAGRAPH_SEPARATOR: + return "Zp"; + case PG_U_CONTROL: + return "Cc"; + case PG_U_FORMAT: + return "Cf"; + case PG_U_PRIVATE_USE: + return "Co"; + case PG_U_SURROGATE: + return "Cs"; + case PG_U_DASH_PUNCTUATION: + return "Pd"; + case PG_U_OPEN_PUNCTUATION: + return "Ps"; + case PG_U_CLOSE_PUNCTUATION: + return "Pe"; + case PG_U_CONNECTOR_PUNCTUATION: + return "Pc"; + case PG_U_OTHER_PUNCTUATION: + return "Po"; + case PG_U_MATH_SYMBOL: + return "Sm"; + case PG_U_CURRENCY_SYMBOL: + return "Sc"; + case PG_U_MODIFIER_SYMBOL: + return "Sk"; + case PG_U_OTHER_SYMBOL: + return "So"; + case PG_U_INITIAL_PUNCTUATION: + return "Pi"; + case PG_U_FINAL_PUNCTUATION: + return "Pf"; + } + + Assert(false); + return "??"; /* keep compiler quiet */ +} |