From: Alexander Borisov
Date: Wed, 15 Jul 2020 16:19:18 +0000 (+0300)
Subject: Introduced UTF-16 according to WHATWG encoding spec.
X-Git-Tag: 0.4.3~17
X-Git-Url: http://git.kaiwu.me/postgresql/log/contrib/postgres_fdw/static/gitweb.js?a=commitdiff_plain;h=29271d0ab946f04a997f78777a0254c482a3a4e6;p=njs.git

Introduced UTF-16 according to WHATWG encoding spec.
---

diff --git a/auto/make b/auto/make
index caac3a2e..4ec533fe 100644
--- a/auto/make
+++ b/auto/make
@@ -241,12 +241,12 @@ lib_test: $NJS_BUILD_DIR/njs_auto_config.h \\
 	$NJS_BUILD_DIR/random_unit_test \\
 	$NJS_BUILD_DIR/rbtree_unit_test \\
 	$NJS_BUILD_DIR/lvlhsh_unit_test \\
-	$NJS_BUILD_DIR/utf8_unit_test
+	$NJS_BUILD_DIR/unicode_unit_test
 
 	$NJS_BUILD_DIR/random_unit_test
 	$NJS_BUILD_DIR/rbtree_unit_test
 	$NJS_BUILD_DIR/lvlhsh_unit_test
-	$NJS_BUILD_DIR/utf8_unit_test
+	$NJS_BUILD_DIR/unicode_unit_test
 
 unit_test: $NJS_BUILD_DIR/njs_auto_config.h \\
 	$NJS_BUILD_DIR/njs_unit_test
diff --git a/auto/sources b/auto/sources
index d5be8ef5..d9109764 100644
--- a/auto/sources
+++ b/auto/sources
@@ -6,6 +6,7 @@ NJS_LIB_SRCS=" \
    src/njs_murmur_hash.c \
    src/njs_djb_hash.c \
    src/njs_utf8.c \
+   src/njs_utf16.c \
    src/njs_arr.c \
    src/njs_rbtree.c \
    src/njs_lvlhsh.c \
@@ -60,7 +61,7 @@ NJS_LIB_TEST_SRCS=" \
    src/test/lvlhsh_unit_test.c \
    src/test/random_unit_test.c \
    src/test/rbtree_unit_test.c \
-   src/test/utf8_unit_test.c \
+   src/test/unicode_unit_test.c \
 "
 
 NJS_TEST_SRCS=" \
diff --git a/src/njs_main.h b/src/njs_main.h
index f03d7f3f..83aeb8af 100644
--- a/src/njs_main.h
+++ b/src/njs_main.h
@@ -14,7 +14,9 @@
 #include
 #include
 #include
+#include
 #include
+#include
 #include
 #include
 #include
diff --git a/src/njs_unicode.h b/src/njs_unicode.h
new file mode 100644
index 00000000..a2d32143
--- /dev/null
+++ b/src/njs_unicode.h
@@ -0,0 +1,23 @@
+
+/*
+ * Copyright (C) Alexander Borisov
+ * Copyright (C) NGINX, Inc.
+ */
+
+#ifndef _NJS_UNICODE_H_INCLUDED_
+#define _NJS_UNICODE_H_INCLUDED_
+
+
+enum {
+    NJS_UNICODE_MAX_CODEPOINT = 0x10FFFF,
+    NJS_UNICODE_ERROR = 0x1FFFFF,
+    NJS_UNICODE_CONTINUE = 0x2FFFFF
+};
+
+typedef struct {
+    uint32_t codepoint;
+    u_char upper;
+} njs_unicode_decode_t;
+
+
+#endif /* _NJS_UNICODE_H_INCLUDED_ */
diff --git a/src/njs_utf16.c b/src/njs_utf16.c
new file mode 100644
index 00000000..6626286a
--- /dev/null
+++ b/src/njs_utf16.c
@@ -0,0 +1,116 @@
+
+/*
+ * Copyright (C) Alexander Borisov
+ * Copyright (C) NGINX, Inc.
+ */
+
+
+#include
+
+
+njs_inline void
+njs_utf16_encode_write(uint32_t cp, u_char **start)
+{
+#ifdef NJS_HAVE_BIG_ENDIAN
+    *(*start)++ = cp >> 8;
+    *(*start)++ = cp & 0x00FF;
+#else
+    *(*start)++ = cp & 0x00FF;
+    *(*start)++ = cp >> 8;
+#endif
+}
+
+
+ssize_t
+njs_utf16_encode(uint32_t cp, u_char **start, const u_char *end)
+{
+    if ((*start + 2) > end) {
+        return NJS_ERROR;
+    }
+
+    if (cp < 0x10000) {
+        njs_utf16_encode_write(cp, start);
+
+        return 2;
+    }
+
+    if ((*start + 4) > end) {
+        return NJS_ERROR;
+    }
+
+    cp -= 0x10000;
+
+    njs_utf16_encode_write((0xD800 | (cp >> 0x0A)), start);
+    njs_utf16_encode_write((0xDC00 | (cp & 0x03FF)), start);
+
+    return 4;
+}
+
+
+uint32_t
+njs_utf16_decode(njs_unicode_decode_t *ctx, const u_char **start,
+    const u_char *end)
+{
+    uint32_t unit;
+    unsigned lead;
+
+    if (ctx->upper != 0x00) {
+        lead = ctx->upper - 0x01;
+        ctx->upper = 0x00;
+
+        goto lead_state;
+    }
+
+pair_state:
+
+    lead = *(*start)++;
+
+    if (*start >= end) {
+        ctx->upper = lead + 0x01;
+        return NJS_UNICODE_CONTINUE;
+    }
+
+lead_state:
+
+#ifdef NJS_HAVE_BIG_ENDIAN
+    unit = (lead << 8) + *(*start)++;
+#else
+    unit = (*(*start)++ << 8) + lead;
+#endif
+
+    if (ctx->codepoint != 0x00) {
+        if ((unsigned) (unit - 0xDC00) <= (0xDFFF - 0xDC00)) {
+            unit = 0x10000 + ((ctx->codepoint - 0xD800) << 10)
+                   + (unit - 0xDC00);
+
+            ctx->codepoint = 0x00;
+
+            return unit;
+        }
+
+        (*start)--;
+
+        ctx->upper = lead + 0x01;
+        ctx->codepoint = 0x00;
+
+        return NJS_UNICODE_ERROR;
+    }
+
+    /* Surrogate pair. */
+
+    if ((unsigned) (unit - 0xD800) <= (0xDFFF - 0xD800)) {
+        if ((unsigned) (unit - 0xDC00) <= (0xDFFF - 0xDC00)) {
+            return NJS_UNICODE_ERROR;
+        }
+
+        ctx->codepoint = unit;
+
+        if (*start >= end) {
+            return NJS_UNICODE_CONTINUE;
+        }
+
+        goto pair_state;
+    }
+
+    return unit;
+}
diff --git a/src/njs_utf16.h b/src/njs_utf16.h
new file mode 100644
index 00000000..b5675586
--- /dev/null
+++ b/src/njs_utf16.h
@@ -0,0 +1,25 @@
+
+/*
+ * Copyright (C) Alexander Borisov
+ * Copyright (C) NGINX, Inc.
+ */
+
+#ifndef _NJS_UTF16_H_INCLUDED_
+#define _NJS_UTF16_H_INCLUDED_
+
+
+NJS_EXPORT ssize_t njs_utf16_encode(uint32_t cp, u_char **start,
+    const u_char *end);
+NJS_EXPORT uint32_t njs_utf16_decode(njs_unicode_decode_t *ctx,
+    const u_char **start, const u_char *end);
+
+
+njs_inline void
+njs_utf16_decode_init(njs_unicode_decode_t *ctx)
+{
+    ctx->upper = 0x00;
+    ctx->codepoint = 0x00;
+}
+
+
+#endif /* _NJS_UTF16_H_INCLUDED_ */
diff --git a/src/test/utf8_unit_test.c b/src/test/unicode_unit_test.c
similarity index 59%
rename from src/test/utf8_unit_test.c
rename to src/test/unicode_unit_test.c
index ff627637..1331f69b 100644
--- a/src/test/utf8_unit_test.c
+++ b/src/test/unicode_unit_test.c
@@ -9,7 +9,6 @@
 
 
 #define NJS_UTF8_START_TEST 0xC2
-//#define NJS_UTF8_START_TEST 0
 
 
 static u_char invalid[] = {
@@ -87,7 +86,7 @@ utf8_unit_test(njs_uint_t start)
     njs_uint_t i, k, l, m;
     const u_char *pp;
 
-    njs_printf("utf8 unit test started\n");
+    njs_printf("utf8 test started\n");
 
     /* Test valid UTF-8. */
 
@@ -181,7 +180,103 @@ utf8_unit_test(njs_uint_t start)
         return NJS_ERROR;
     }
 
-    njs_printf("utf8 unit test passed\n");
+    njs_printf("utf8 test passed\n");
+    return NJS_OK;
+}
+
+
+static njs_int_t
+utf16_unit_test()
+{
+    int8_t length, length_to;
+    u_char *start, *end, *end_to;
+    uint32_t cp, i;
+    njs_unicode_decode_t ctx;
+    u_char buf[8], to[4];
+
+    njs_printf("utf16 test started\n");
+
+    end = buf + sizeof(buf);
+    end_to = to + sizeof(to);
+
+    for (i = 0; i <= NJS_UNICODE_MAX_CODEPOINT; i++) {
+
+        /* Skip surrogate pair. */
+
+        if (i >= 0xD800 && i <= 0xDFFF) {
+            continue;
+        }
+
+        start = buf;
+
+        length = njs_utf16_encode(i, &start, end);
+        if (length < NJS_OK) {
+            njs_printf("utf16 test encode failed\n");
+            return NJS_ERROR;
+        }
+
+        njs_utf16_decode_init(&ctx);
+
+        start = buf;
+
+        cp = njs_utf16_decode(&ctx, (const u_char **) &start, start + length);
+        if (cp > NJS_UNICODE_MAX_CODEPOINT) {
+            njs_printf("utf16 test decode failed\n");
+            return NJS_ERROR;
+        }
+
+        if (cp != i) {
+            njs_printf("utf16 test decode code point does not match\n");
+            return NJS_ERROR;
+        }
+
+        start = to;
+
+        length_to = njs_utf16_encode(cp, &start, end_to);
+        if (length_to < NJS_OK) {
+            njs_printf("utf16 test encode failed\n");
+            return NJS_ERROR;
+        }
+
+        if (length_to != length || njs_strncmp(buf, to, length) != 0) {
+            njs_printf("utf16 test decode-encode failed\n");
+            return NJS_ERROR;
+        }
+    }
+
+    /* Surrogate pair. */
+
+    for (i = 0xD800; i <= 0xDFFF; i++) {
+        start = buf;
+
+        length = njs_utf16_encode(i, &start, end);
+        if (length < NJS_OK) {
+            njs_printf("utf16 test surrogate pair encode lead failed\n");
+            return NJS_ERROR;
+        }
+
+        length_to = njs_utf16_encode(i - 0xD800 + 0xDC00, &start, end);
+        if (length_to < NJS_OK) {
+            njs_printf("utf16 test surrogate pair encode failed\n");
+            return NJS_ERROR;
+        }
+
+        njs_utf16_decode_init(&ctx);
+
+        start = buf;
+
+        cp = njs_utf16_decode(&ctx, (const u_char **) &start,
+                              start + length + length_to);
+        if (cp > NJS_UNICODE_MAX_CODEPOINT) {
+            if (i < 0xDC00) {
+                njs_printf("utf16 test surrogate pair decode failed\n");
+                return NJS_ERROR;
+            }
+        }
+    }
+
+    njs_printf("utf16 test passed\n");
+
     return NJS_OK;
 }
 
@@ -189,8 +284,11 @@ utf8_unit_test(njs_uint_t start)
 int
 main(int argc, char **argv)
 {
+    njs_int_t ret;
     njs_uint_t start;
 
+    njs_printf("unicode unit test started\n");
+
     if (argc > 1 && argv[1][0] == 'a') {
         start = NJS_UTF8_START_TEST;
 
@@ -198,5 +296,17 @@ main(int argc, char **argv)
         start = 256;
     }
 
-    return utf8_unit_test(start);
+    ret = utf8_unit_test(start);
+    if (ret != NJS_OK) {
+        return ret;
+    }
+
+    ret = utf16_unit_test();
+    if (ret != NJS_OK) {
+        return ret;
+    }
+
+    njs_printf("unicode unit test passed\n");
+
+    return 0;
 }