diff options
Diffstat (limited to 'src/common/unicode_case.c')
-rw-r--r-- | src/common/unicode_case.c | 76 |
1 files changed, 68 insertions, 8 deletions
diff --git a/src/common/unicode_case.c b/src/common/unicode_case.c
index 5e77490006f..bc423b0890c 100644
--- a/src/common/unicode_case.c
+++ b/src/common/unicode_case.c
@@ -21,8 +21,9 @@
 #include "mb/pg_wchar.h"
 
 static const pg_case_map *find_case_map(pg_wchar ucs);
-static size_t convert_case(char *dst, size_t dstsize, const char *src,
-						   ssize_t srclen, CaseKind casekind);
+static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
+						   CaseKind str_casekind, WordBoundaryNext wbnext,
+						   void *wbstate);
 
 pg_wchar
 unicode_lowercase_simple(pg_wchar code)
@@ -67,7 +68,40 @@ unicode_uppercase_simple(pg_wchar code)
 size_t
 unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
 {
-	return convert_case(dst, dstsize, src, srclen, CaseLower);
+	return convert_case(dst, dstsize, src, srclen, CaseLower, NULL, NULL);
+}
+
+/*
+ * unicode_strtitle()
+ *
+ * Convert src to titlecase, and return the result length (not including
+ * terminating NUL).
+ *
+ * String src must be encoded in UTF-8. If srclen < 0, src must be
+ * NUL-terminated.
+ *
+ * Result string is stored in dst, truncating if larger than dstsize. If
+ * dstsize is greater than the result length, dst will be NUL-terminated;
+ * otherwise not.
+ *
+ * If dstsize is zero, dst may be NULL. This is useful for calculating the
+ * required buffer size before allocating.
+ *
+ * Titlecasing requires knowledge about word boundaries, which is provided by
+ * the callback wbnext. A word boundary is the offset of the start of a word
+ * or the offset of the character immediately following a word.
+ *
+ * The caller is expected to initialize and free the callback state
+ * wbstate. The callback should first return offset 0 for the first boundary;
+ * then the offset of each subsequent word boundary; then the total length of
+ * the string to indicate the final boundary.
+ */
+size_t
+unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen,
+				 WordBoundaryNext wbnext, void *wbstate)
+{
+	return convert_case(dst, dstsize, src, srclen, CaseTitle, wbnext,
+						wbstate);
 }
 
 /*
@@ -89,20 +123,34 @@ unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
 size_t
 unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen)
 {
-	return convert_case(dst, dstsize, src, srclen, CaseUpper);
+	return convert_case(dst, dstsize, src, srclen, CaseUpper, NULL, NULL);
 }
 
 /*
- * Implement Unicode Default Case Conversion algorithm.
+ * If str_casekind is CaseLower or CaseUpper, map each character in the string
+ * for which a mapping is available.
  *
- * Map each character in the string for which a mapping is available.
+ * If str_casekind is CaseTitle, maps characters found on a word boundary to
+ * uppercase and other characters to lowercase.
  */
 static size_t
 convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
-			 CaseKind casekind)
+			 CaseKind str_casekind, WordBoundaryNext wbnext, void *wbstate)
 {
+	/* character CaseKind varies while titlecasing */
+	CaseKind	chr_casekind = str_casekind;
 	size_t		srcoff = 0;
 	size_t		result_len = 0;
+	size_t		boundary = 0;
+
+	Assert((str_casekind == CaseTitle && wbnext && wbstate) ||
+		   (str_casekind != CaseTitle && !wbnext && !wbstate));
+
+	if (str_casekind == CaseTitle)
+	{
+		boundary = wbnext(wbstate);
+		Assert(boundary == 0);	/* start of text is always a boundary */
+	}
 
 	while ((srclen < 0 || srcoff < srclen) && src[srcoff] != '\0')
 	{
@@ -110,9 +158,21 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
 		int			u1len = unicode_utf8len(u1);
 		const pg_case_map *casemap = find_case_map(u1);
 
+		if (str_casekind == CaseTitle)
+		{
+			if (srcoff == boundary)
+			{
+				chr_casekind = CaseUpper;
+				boundary = wbnext(wbstate);
+			}
+			else
+				chr_casekind = CaseLower;
+		}
+
+		/* perform mapping, update result_len, and write to dst */
 		if (casemap)
 		{
-			pg_wchar	u2 = casemap->simplemap[casekind];
+			pg_wchar	u2 = casemap->simplemap[chr_casekind];
 			pg_wchar	u2len = unicode_utf8len(u2);
 
 			if (result_len + u2len <= dstsize)