aboutsummaryrefslogtreecommitdiff
path: root/src/common/unicode_category.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/common/unicode_category.c')
-rw-r--r-- src/common/unicode_category.c 195
1 files changed, 195 insertions, 0 deletions
diff --git a/src/common/unicode_category.c b/src/common/unicode_category.c
new file mode 100644
index 00000000000..cec9c0d998f
--- /dev/null
+++ b/src/common/unicode_category.c
@@ -0,0 +1,195 @@
+/*-------------------------------------------------------------------------
+ * unicode_category.c
+ * Determine general category of Unicode characters.
+ *
+ * Portions Copyright (c) 2017-2023, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/common/unicode_category.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef FRONTEND
+#include "postgres.h"
+#else
+#include "postgres_fe.h"
+#endif
+
+#include "common/unicode_category.h"
+#include "common/unicode_category_table.h"
+
+/*
+ * Look up the Unicode general category of a codepoint.
+ *
+ * Performs a binary search over unicode_categories, returning the category
+ * of the entry whose [first, last] range contains ucs.  (The search
+ * presumes the table is ordered by codepoint range; that ordering is not
+ * visible from this file.)
+ *
+ * ucs is asserted to lie between the first codepoint of the initial table
+ * entry and the last codepoint of the final table entry.  If no entry
+ * matches, an assertion fires and (pg_unicode_category) -1 is returned.
+ */
+pg_unicode_category
+unicode_category(pg_wchar ucs)
+{
+    int         last_idx = lengthof(unicode_categories) - 1;
+    int         lo = 0;
+    int         hi = last_idx;
+
+    Assert(ucs >= unicode_categories[0].first &&
+           ucs <= unicode_categories[last_idx].last);
+
+    while (lo <= hi)
+    {
+        /* midpoint of the remaining window; equals (lo + hi) / 2 */
+        int         probe = lo + (hi - lo) / 2;
+
+        if (ucs > unicode_categories[probe].last)
+            lo = probe + 1;     /* target lies above this range */
+        else if (ucs < unicode_categories[probe].first)
+            hi = probe - 1;     /* target lies below this range */
+        else
+            return unicode_categories[probe].category;
+    }
+
+    /* window emptied without finding a containing range */
+    Assert(false);
+    return (pg_unicode_category) - 1;
+}
+
+/*
+ * Long descriptive name of a Unicode general category, e.g.
+ * "Uppercase_Letter".
+ *
+ * The result is a string literal.  A category value matching none of the
+ * cases below triggers Assert(false); if execution continues past that,
+ * "Unrecognized" is returned.
+ */
+const char *
+unicode_category_string(pg_unicode_category category)
+{
+    const char *name = NULL;
+
+    /* No default label: an unmatched value leaves name NULL, handled below. */
+    switch (category)
+    {
+        case PG_U_UNASSIGNED:
+            name = "Unassigned";
+            break;
+        case PG_U_UPPERCASE_LETTER:
+            name = "Uppercase_Letter";
+            break;
+        case PG_U_LOWERCASE_LETTER:
+            name = "Lowercase_Letter";
+            break;
+        case PG_U_TITLECASE_LETTER:
+            name = "Titlecase_Letter";
+            break;
+        case PG_U_MODIFIER_LETTER:
+            name = "Modifier_Letter";
+            break;
+        case PG_U_OTHER_LETTER:
+            name = "Other_Letter";
+            break;
+        case PG_U_NONSPACING_MARK:
+            name = "Nonspacing_Mark";
+            break;
+        case PG_U_ENCLOSING_MARK:
+            name = "Enclosing_Mark";
+            break;
+        case PG_U_SPACING_MARK:
+            name = "Spacing_Mark";
+            break;
+        case PG_U_DECIMAL_NUMBER:
+            name = "Decimal_Number";
+            break;
+        case PG_U_LETTER_NUMBER:
+            name = "Letter_Number";
+            break;
+        case PG_U_OTHER_NUMBER:
+            name = "Other_Number";
+            break;
+        case PG_U_SPACE_SEPARATOR:
+            name = "Space_Separator";
+            break;
+        case PG_U_LINE_SEPARATOR:
+            name = "Line_Separator";
+            break;
+        case PG_U_PARAGRAPH_SEPARATOR:
+            name = "Paragraph_Separator";
+            break;
+        case PG_U_CONTROL:
+            name = "Control";
+            break;
+        case PG_U_FORMAT:
+            name = "Format";
+            break;
+        case PG_U_PRIVATE_USE:
+            name = "Private_Use";
+            break;
+        case PG_U_SURROGATE:
+            name = "Surrogate";
+            break;
+        case PG_U_DASH_PUNCTUATION:
+            name = "Dash_Punctuation";
+            break;
+        case PG_U_OPEN_PUNCTUATION:
+            name = "Open_Punctuation";
+            break;
+        case PG_U_CLOSE_PUNCTUATION:
+            name = "Close_Punctuation";
+            break;
+        case PG_U_CONNECTOR_PUNCTUATION:
+            name = "Connector_Punctuation";
+            break;
+        case PG_U_OTHER_PUNCTUATION:
+            name = "Other_Punctuation";
+            break;
+        case PG_U_MATH_SYMBOL:
+            name = "Math_Symbol";
+            break;
+        case PG_U_CURRENCY_SYMBOL:
+            name = "Currency_Symbol";
+            break;
+        case PG_U_MODIFIER_SYMBOL:
+            name = "Modifier_Symbol";
+            break;
+        case PG_U_OTHER_SYMBOL:
+            name = "Other_Symbol";
+            break;
+        case PG_U_INITIAL_PUNCTUATION:
+            name = "Initial_Punctuation";
+            break;
+        case PG_U_FINAL_PUNCTUATION:
+            name = "Final_Punctuation";
+            break;
+    }
+
+    if (name == NULL)
+    {
+        Assert(false);
+        name = "Unrecognized";  /* keep compiler quiet */
+    }
+
+    return name;
+}
+
+/*
+ * Two-letter abbreviation of a Unicode general category, e.g. "Lu" for
+ * PG_U_UPPERCASE_LETTER.
+ *
+ * The result is a string literal.  A category value matching none of the
+ * cases below triggers Assert(false); if execution continues past that,
+ * "??" is returned.
+ */
+const char *
+unicode_category_abbrev(pg_unicode_category category)
+{
+    const char *abbrev = NULL;
+
+    /* No default label: an unmatched value leaves abbrev NULL, handled below. */
+    switch (category)
+    {
+        case PG_U_UNASSIGNED:
+            abbrev = "Cn";
+            break;
+        case PG_U_UPPERCASE_LETTER:
+            abbrev = "Lu";
+            break;
+        case PG_U_LOWERCASE_LETTER:
+            abbrev = "Ll";
+            break;
+        case PG_U_TITLECASE_LETTER:
+            abbrev = "Lt";
+            break;
+        case PG_U_MODIFIER_LETTER:
+            abbrev = "Lm";
+            break;
+        case PG_U_OTHER_LETTER:
+            abbrev = "Lo";
+            break;
+        case PG_U_NONSPACING_MARK:
+            abbrev = "Mn";
+            break;
+        case PG_U_ENCLOSING_MARK:
+            abbrev = "Me";
+            break;
+        case PG_U_SPACING_MARK:
+            abbrev = "Mc";
+            break;
+        case PG_U_DECIMAL_NUMBER:
+            abbrev = "Nd";
+            break;
+        case PG_U_LETTER_NUMBER:
+            abbrev = "Nl";
+            break;
+        case PG_U_OTHER_NUMBER:
+            abbrev = "No";
+            break;
+        case PG_U_SPACE_SEPARATOR:
+            abbrev = "Zs";
+            break;
+        case PG_U_LINE_SEPARATOR:
+            abbrev = "Zl";
+            break;
+        case PG_U_PARAGRAPH_SEPARATOR:
+            abbrev = "Zp";
+            break;
+        case PG_U_CONTROL:
+            abbrev = "Cc";
+            break;
+        case PG_U_FORMAT:
+            abbrev = "Cf";
+            break;
+        case PG_U_PRIVATE_USE:
+            abbrev = "Co";
+            break;
+        case PG_U_SURROGATE:
+            abbrev = "Cs";
+            break;
+        case PG_U_DASH_PUNCTUATION:
+            abbrev = "Pd";
+            break;
+        case PG_U_OPEN_PUNCTUATION:
+            abbrev = "Ps";
+            break;
+        case PG_U_CLOSE_PUNCTUATION:
+            abbrev = "Pe";
+            break;
+        case PG_U_CONNECTOR_PUNCTUATION:
+            abbrev = "Pc";
+            break;
+        case PG_U_OTHER_PUNCTUATION:
+            abbrev = "Po";
+            break;
+        case PG_U_MATH_SYMBOL:
+            abbrev = "Sm";
+            break;
+        case PG_U_CURRENCY_SYMBOL:
+            abbrev = "Sc";
+            break;
+        case PG_U_MODIFIER_SYMBOL:
+            abbrev = "Sk";
+            break;
+        case PG_U_OTHER_SYMBOL:
+            abbrev = "So";
+            break;
+        case PG_U_INITIAL_PUNCTUATION:
+            abbrev = "Pi";
+            break;
+        case PG_U_FINAL_PUNCTUATION:
+            abbrev = "Pf";
+            break;
+    }
+
+    if (abbrev == NULL)
+    {
+        Assert(false);
+        abbrev = "??";          /* keep compiler quiet */
+    }
+
+    return abbrev;
+}