From: Alexander Borisov Date: Wed, 15 Jul 2020 16:19:19 +0000 (+0300) Subject: Introduced UTF-8 decoder according to WHATWG encoding spec. X-Git-Tag: 0.4.3~16 X-Git-Url: http://git.kaiwu.me/postgresql/log/contrib/postgres_fdw/static/gitweb.js?a=commitdiff_plain;h=98c2616372ecf8bef4f145abdc565da5f51eab24;p=njs.git Introduced UTF-8 decoder according to WHATWG encoding spec. --- diff --git a/src/njs_json.c b/src/njs_json.c index 8405add1..975693b2 100644 --- a/src/njs_json.c +++ b/src/njs_json.c @@ -728,7 +728,7 @@ njs_json_parse_string(njs_json_parse_ctx_t *ctx, njs_value_t *value, if (njs_surrogate_any(utf)) { if (utf > 0xdbff || p[0] != '\\' || p[1] != 'u') { - s = njs_utf8_encode(s, NJS_UTF8_REPLACEMENT); + s = njs_utf8_encode(s, NJS_UNICODE_REPLACEMENT); continue; } @@ -741,12 +741,12 @@ njs_json_parse_string(njs_json_parse_ctx_t *ctx, njs_value_t *value, utf = njs_string_surrogate_pair(utf, utf_low); } else if (njs_surrogate_leading(utf_low)) { - utf = NJS_UTF8_REPLACEMENT; - s = njs_utf8_encode(s, NJS_UTF8_REPLACEMENT); + utf = NJS_UNICODE_REPLACEMENT; + s = njs_utf8_encode(s, NJS_UNICODE_REPLACEMENT); } else { utf = utf_low; - s = njs_utf8_encode(s, NJS_UTF8_REPLACEMENT); + s = njs_utf8_encode(s, NJS_UNICODE_REPLACEMENT); } } diff --git a/src/njs_parser.c b/src/njs_parser.c index 1f0b9673..81443db2 100644 --- a/src/njs_parser.c +++ b/src/njs_parser.c @@ -7896,11 +7896,12 @@ njs_int_t njs_parser_string_create(njs_vm_t *vm, njs_lexer_token_t *token, njs_value_t *value) { - u_char *dst; - ssize_t size, length; - uint32_t cp; - njs_str_t *src; - const u_char *p, *end; + u_char *dst; + ssize_t size, length; + uint32_t cp; + njs_str_t *src; + const u_char *p, *end; + njs_unicode_decode_t ctx; src = &token->text; @@ -7914,10 +7915,17 @@ njs_parser_string_create(njs_vm_t *vm, njs_lexer_token_t *token, p = src->start; end = src->start + src->length; + njs_utf8_decode_init(&ctx); + while (p < end) { - cp = njs_utf8_safe_decode(&p, end); + cp = njs_utf8_decode(&ctx, &p, end); - dst = njs_utf8_encode(dst, cp); + if (cp <= NJS_UNICODE_MAX_CODEPOINT) { + dst = njs_utf8_encode(dst, cp); + + } else { + dst = njs_utf8_encode(dst, NJS_UNICODE_REPLACEMENT); + } } if (length > NJS_STRING_MAP_STRIDE && size != length) { @@ -7932,12 +7940,13 @@ static njs_token_type_t njs_parser_escape_string_create(njs_parser_t *parser, njs_lexer_token_t *token, njs_value_t *value) { - u_char c, *start, *dst; - size_t size, length, hex_length; - uint64_t cp, cp_pair; - njs_int_t ret; - njs_str_t *string; - const u_char *src, *end, *hex_end; + u_char c, *start, *dst; + size_t size, length, hex_length; + uint64_t cp, cp_pair; + njs_int_t ret; + njs_str_t *string; + const u_char *src, *end, *hex_end; + njs_unicode_decode_t ctx; ret = njs_parser_escape_string_calc_length(parser, token, &size, &length); if (njs_slow_path(ret != NJS_OK)) { @@ -8053,7 +8062,13 @@ njs_parser_escape_string_create(njs_parser_t *parser, njs_lexer_token_t *token, src--; - cp = njs_utf8_safe_decode2(&src, end); + njs_utf8_decode_init(&ctx); + + cp = njs_utf8_decode(&ctx, &src, end); + if (cp > NJS_UNICODE_MAX_CODEPOINT) { + cp = NJS_UNICODE_REPLACEMENT; + } + dst = njs_utf8_encode(dst, cp); continue; @@ -8076,12 +8091,12 @@ njs_parser_escape_string_create(njs_parser_t *parser, njs_lexer_token_t *token, cp = njs_string_surrogate_pair(cp_pair, cp); } else if (njs_slow_path(njs_surrogate_leading(cp))) { - cp = NJS_UTF8_REPLACEMENT; + cp = NJS_UNICODE_REPLACEMENT; dst = njs_utf8_encode(dst, (uint32_t) cp); } else { - dst = njs_utf8_encode(dst, NJS_UTF8_REPLACEMENT); + dst = njs_utf8_encode(dst, NJS_UNICODE_REPLACEMENT); } cp_pair = 0; @@ -8092,7 +8107,7 @@ njs_parser_escape_string_create(njs_parser_t *parser, njs_lexer_token_t *token, continue; } - cp = NJS_UTF8_REPLACEMENT; + cp = NJS_UNICODE_REPLACEMENT; } dst = njs_utf8_encode(dst, (uint32_t) cp); @@ -8116,10 +8131,11 @@ static njs_int_t njs_parser_escape_string_calc_length(njs_parser_t *parser, njs_lexer_token_t *token, size_t *out_size, size_t *out_length) { - size_t size, length, hex_length; - uint64_t cp, cp_pair; - njs_str_t *string; - const u_char *ptr, *src, *end, *hex_end; + size_t size, length, hex_length; + uint64_t cp, cp_pair; + njs_str_t *string; + const u_char *ptr, *src, *end, *hex_end; + njs_unicode_decode_t ctx; size = 0; length = 0; @@ -8173,7 +8189,12 @@ njs_parser_escape_string_calc_length(njs_parser_t *parser, } if (*src >= 0x80) { - cp = njs_utf8_safe_decode2(&src, end); + njs_utf8_decode_init(&ctx); + + cp = njs_utf8_decode(&ctx, &src, end); + if (cp > NJS_UNICODE_MAX_CODEPOINT) { + cp = NJS_UNICODE_REPLACEMENT; + } size += njs_utf8_size(cp); length++; @@ -8220,13 +8241,13 @@ njs_parser_escape_string_calc_length(njs_parser_t *parser, cp = njs_string_surrogate_pair(cp_pair, cp); } else if (njs_slow_path(njs_surrogate_leading(cp))) { - cp = NJS_UTF8_REPLACEMENT; + cp = NJS_UNICODE_REPLACEMENT; size += njs_utf8_size(cp); length++; } else { - size += njs_utf8_size(NJS_UTF8_REPLACEMENT); + size += njs_utf8_size(NJS_UNICODE_REPLACEMENT); length++; } @@ -8238,7 +8259,7 @@ njs_parser_escape_string_calc_length(njs_parser_t *parser, continue; } - cp = NJS_UTF8_REPLACEMENT; + cp = NJS_UNICODE_REPLACEMENT; } size += njs_utf8_size(cp); diff --git a/src/njs_string.c b/src/njs_string.c index 74c387e8..9aa24a26 100644 --- a/src/njs_string.c +++ b/src/njs_string.c @@ -20,10 +20,8 @@ static njs_int_t njs_string_slice_prop(njs_vm_t *vm, njs_string_prop_t *string, njs_slice_prop_t *slice, njs_value_t *args, njs_uint_t nargs); static njs_int_t njs_string_slice_args(njs_vm_t *vm, njs_slice_prop_t *slice, njs_value_t *args, njs_uint_t nargs); -static njs_int_t njs_string_from_char_code(njs_vm_t *vm, - njs_value_t *args, njs_uint_t nargs, njs_index_t unused); -static njs_int_t njs_string_from_code_point(njs_vm_t *vm, njs_value_t *args, - njs_uint_t nargs, njs_index_t unused); +static njs_int_t njs_string_from_char_code(njs_vm_t *vm, njs_value_t *args, + njs_uint_t nargs, njs_index_t is_point); static njs_int_t njs_string_bytes_from(njs_vm_t *vm, njs_value_t *args, njs_uint_t nargs, njs_index_t unused); static njs_int_t njs_string_bytes_from_array_like(njs_vm_t *vm, @@ -545,7 +543,7 @@ static const njs_object_prop_t njs_string_constructor_properties[] = { .type = NJS_PROPERTY, .name = njs_string("fromCharCode"), - .value = njs_native_function(njs_string_from_char_code, 1), + .value = njs_native_function2(njs_string_from_char_code, 1, 0), .writable = 1, .configurable = 1, }, @@ -553,7 +551,7 @@ static const njs_object_prop_t njs_string_constructor_properties[] = { .type = NJS_PROPERTY, .name = njs_string("fromCodePoint"), - .value = njs_native_function(njs_string_from_code_point, 1), + .value = njs_native_function2(njs_string_from_char_code, 1, 1), .writable = 1, .configurable = 1, }, @@ -1029,13 +1027,14 @@ static njs_int_t njs_string_prototype_to_bytes(njs_vm_t *vm, njs_value_t *args, njs_uint_t nargs, njs_index_t unused) { - u_char *p; - size_t length; - uint32_t byte; - njs_int_t ret; - const u_char *s, *end; - njs_slice_prop_t slice; - njs_string_prop_t string; + u_char *p; + size_t length; + uint32_t byte; + njs_int_t ret; + const u_char *s, *end; + njs_slice_prop_t slice; + njs_string_prop_t string; + njs_unicode_decode_t ctx; ret = njs_string_object_validate(vm, njs_arg(args, nargs, 0)); if (njs_slow_path(ret != NJS_OK)) { @@ -1064,8 +1063,10 @@ njs_string_prototype_to_bytes(njs_vm_t *vm, njs_value_t *args, njs_uint_t nargs, length = slice.length; + njs_utf8_decode_init(&ctx); + while (length != 0 && s < end) { - byte = njs_utf8_decode(&s, end); + byte = njs_utf8_decode(&ctx, &s, end); if (njs_slow_path(byte > 0xFF)) { njs_release(vm, &vm->retval); @@ -1463,13 +1464,14 @@ static njs_int_t njs_string_prototype_char_code_at(njs_vm_t *vm, njs_value_t *args, njs_uint_t nargs, njs_index_t unused) { - double num; - size_t length; - int64_t index; - uint32_t code; - njs_int_t ret; - const u_char *start, *end; - njs_string_prop_t string; + double num; + size_t length; + int64_t index; + uint32_t code; + njs_int_t ret; + const u_char *start, *end; + njs_string_prop_t string; + njs_unicode_decode_t ctx; ret = njs_string_object_validate(vm, njs_arg(args, nargs, 0)); if (njs_slow_path(ret != NJS_OK)) { @@ -1493,10 +1495,12 @@ njs_string_prototype_char_code_at(njs_vm_t *vm, njs_value_t *args, code = string.start[index]; } else { + njs_utf8_decode_init(&ctx); + /* UTF-8 string. */ end = string.start + string.size; start = njs_string_offset(string.start, end, index); - code = njs_utf8_decode(&start, end); + code = njs_utf8_decode(&ctx, &start, end); } num = code; @@ -1829,14 +1833,27 @@ njs_decode_base64_core(njs_vm_t *vm, njs_value_t *value, const njs_str_t *src, static njs_int_t -njs_string_from_char_code(njs_vm_t *vm, njs_value_t *args, - njs_uint_t nargs, njs_index_t unused) +njs_string_from_char_code(njs_vm_t *vm, njs_value_t *args, njs_uint_t nargs, + njs_index_t is_point) { - u_char *p; - size_t size; - uint16_t code; - njs_int_t ret; - njs_uint_t i; + double num; + u_char *p, *start, *end; + ssize_t len; + int32_t code; + uint32_t cp; + uint64_t length, size; + njs_int_t ret; + njs_uint_t i; + njs_unicode_decode_t ctx; + u_char buf[4]; + + size = 0; + length = 0; + + cp = 0x00; + end = buf + sizeof(buf); + + njs_utf16_decode_init(&ctx); for (i = 1; i < nargs; i++) { if (!njs_is_numeric(&args[i])) { @@ -1845,73 +1862,76 @@ njs_string_from_char_code(njs_vm_t *vm, njs_value_t *args, return ret; } } - } - size = 0; + if (is_point) { + num = njs_number(&args[i]); + if (isnan(num)) { + goto range_error; + } - for (i = 1; i < nargs; i++) { - code = njs_number_to_uint16(njs_number(&args[i])); - size += njs_utf8_size_uint16(code); - } + code = num; - p = njs_string_alloc(vm, &vm->retval, size, nargs - 1); - if (njs_slow_path(p == NULL)) { - return NJS_ERROR; - } - - for (i = 1; i < nargs; i++) { - code = njs_number_to_uint16(njs_number(&args[i])); - p = njs_utf8_encode(p, code); - } + if (code != num || code < 0 || code > 0x10FFFF) { + goto range_error; + } - return NJS_OK; -} + } else { + code = njs_number_to_uint16(njs_number(&args[i])); + } + start = buf; + len = njs_utf16_encode(code, &start, end); -static njs_int_t -njs_string_from_code_point(njs_vm_t *vm, njs_value_t *args, njs_uint_t nargs, - njs_index_t unused) -{ - u_char *p; - double num; - size_t size; - int32_t code; - njs_int_t ret; - njs_uint_t i; + start = buf; + cp = njs_utf16_decode(&ctx, (const u_char **) &start, start + len); - for (i = 1; i < nargs; i++) { - if (!njs_is_numeric(&args[i])) { - ret = njs_value_to_numeric(vm, &args[i], &args[i]); - if (ret != NJS_OK) { - return ret; + if (cp > NJS_UNICODE_MAX_CODEPOINT) { + if (cp == NJS_UNICODE_CONTINUE) { + continue; } - } - } - - size = 0; - for (i = 1; i < nargs; i++) { - num = njs_number(&args[i]); - if (isnan(num)) { - goto range_error; + cp = NJS_UNICODE_REPLACEMENT; } - code = num; - - if (code != num || code < 0 || code >= 0x110000) { - goto range_error; - } + size += njs_utf8_size(cp); + length++; + } - size += njs_utf8_size(code); + if (cp == NJS_UNICODE_CONTINUE) { + size += njs_utf8_size(NJS_UNICODE_REPLACEMENT); + length++; } - p = njs_string_alloc(vm, &vm->retval, size, nargs - 1); + p = njs_string_alloc(vm, &vm->retval, size, length); if (njs_slow_path(p == NULL)) { return NJS_ERROR; } + njs_utf16_decode_init(&ctx); + for (i = 1; i < nargs; i++) { - p = njs_utf8_encode(p, njs_number(&args[i])); + if (is_point) { + code = njs_number(&args[i]); + + } else { + code = njs_number_to_uint16(njs_number(&args[i])); + } + + start = buf; + len = njs_utf16_encode(code, &start, end); + + start = buf; + cp = njs_utf16_decode(&ctx, (const u_char **) &start, start + len); + + if (cp > NJS_UNICODE_MAX_CODEPOINT) { + if (cp == NJS_UNICODE_CONTINUE && i + 1 != nargs) { + continue; + } + + cp = NJS_UNICODE_REPLACEMENT; + } + + p = njs_utf8_encode(p, cp); } return NJS_OK; @@ -2591,11 +2611,12 @@ static njs_int_t njs_string_prototype_trim(njs_vm_t *vm, njs_value_t *args, njs_uint_t nargs, njs_index_t mode) { - uint32_t u, trim, length; - njs_int_t ret; - njs_value_t *value; - const u_char *p, *prev, *start, *end; - njs_string_prop_t string; + uint32_t u, trim, length; + njs_int_t ret; + njs_value_t *value; + const u_char *p, *prev, *start, *end; + njs_string_prop_t string; + njs_unicode_decode_t ctx; value = njs_argument(args, 0); ret = njs_string_object_validate(vm, value); @@ -2651,13 +2672,15 @@ njs_string_prototype_trim(njs_vm_t *vm, njs_value_t *args, njs_uint_t nargs, /* UTF-8 string. */ if (mode & NJS_TRIM_START) { + njs_utf8_decode_init(&ctx); + for ( ;; ) { if (start == end) { goto empty; } p = start; - u = njs_utf8_decode(&start, end); + u = njs_utf8_decode(&ctx, &start, end); if (njs_utf8_is_whitespace(u)) { trim++; @@ -2672,6 +2695,8 @@ njs_string_prototype_trim(njs_vm_t *vm, njs_value_t *args, njs_uint_t nargs, if (mode & NJS_TRIM_END) { prev = end; + njs_utf8_decode_init(&ctx); + for ( ;; ) { if (start == prev) { goto empty; @@ -2679,7 +2704,7 @@ njs_string_prototype_trim(njs_vm_t *vm, njs_value_t *args, njs_uint_t nargs, prev = njs_utf8_prev(prev); p = prev; - u = njs_utf8_decode(&p, end); + u = njs_utf8_decode(&ctx, &p, end); if (njs_utf8_is_whitespace(u)) { trim++; @@ -3640,11 +3665,12 @@ njs_string_prototype_replace(njs_vm_t *vm, njs_value_t *args, njs_uint_t nargs, double njs_string_to_number(const njs_value_t *value, njs_bool_t parse_float) { - double num; - size_t size; - uint32_t u; - njs_bool_t minus; - const u_char *p, *start, *end; + double num; + size_t size; + uint32_t u; + njs_bool_t minus; + const u_char *p, *start, *end; + njs_unicode_decode_t ctx; const size_t infinity = njs_length("Infinity"); @@ -3660,9 +3686,11 @@ njs_string_to_number(const njs_value_t *value, njs_bool_t parse_float) end = p + size; + njs_utf8_decode_init(&ctx); + while (p < end) { start = p; - u = njs_utf8_decode(&p, end); + u = njs_utf8_decode(&ctx, &p, end); if (!njs_utf8_is_whitespace(u)) { p = start; @@ -4179,15 +4207,16 @@ njs_int_t njs_string_encode_uri(njs_vm_t *vm, njs_value_t *args, njs_uint_t nargs, njs_index_t component) { - u_char byte, *dst; - uint64_t size; - uint32_t cp, cp_low; - njs_int_t ret; - njs_value_t *value; - const u_char *src, *end; - const uint32_t *escape; - njs_string_prop_t string; - u_char encode[4]; + u_char byte, *dst; + uint64_t size; + uint32_t cp, cp_low; + njs_int_t ret; + njs_value_t *value; + const u_char *src, *end; + const uint32_t *escape; + njs_string_prop_t string; + njs_unicode_decode_t ctx; + u_char encode[4]; static const uint32_t escape_uri[] = { 0xffffffff, /* 1111 1111 1111 1111 1111 1111 1111 1111 */ @@ -4257,8 +4286,10 @@ njs_string_encode_uri(njs_vm_t *vm, njs_value_t *args, njs_uint_t nargs, } else { /* UTF-8 string. */ + njs_utf8_decode_init(&ctx); + while (src < end) { - cp = njs_utf8_decode(&src, end); + cp = njs_utf8_decode(&ctx, &src, end); if (cp < 0x80 && !njs_need_escape(escape, cp)) { size++; @@ -4271,7 +4302,7 @@ njs_string_encode_uri(njs_vm_t *vm, njs_value_t *args, njs_uint_t nargs, } if (njs_surrogate_leading(cp)) { - cp_low = njs_utf8_decode(&src, end); + cp_low = njs_utf8_decode(&ctx, &src, end); if (njs_slow_path(!njs_surrogate_trailing(cp_low))) { goto uri_error; @@ -4310,11 +4341,13 @@ njs_string_encode_uri(njs_vm_t *vm, njs_value_t *args, njs_uint_t nargs, /* UTF-8 string. */ + njs_utf8_decode_init(&ctx); + while (src < end) { - cp = njs_utf8_decode(&src, end); + cp = njs_utf8_decode(&ctx, &src, end); if (njs_slow_path(njs_surrogate_leading(cp))) { - cp_low = njs_utf8_decode(&src, end); + cp_low = njs_utf8_decode(&ctx, &src, end); cp = njs_string_surrogate_pair(cp, cp_low); } @@ -4337,11 +4370,14 @@ njs_inline uint32_t njs_string_decode_uri_cp(const int8_t *hex, const u_char **start, const u_char *end, njs_bool_t expect_percent) { - int8_t d0, d1; - uint32_t cp; - const u_char *p; + int8_t d0, d1; + uint32_t cp; + const u_char *p; + njs_unicode_decode_t ctx; - cp = njs_utf8_decode(start, end); + njs_utf8_decode_init(&ctx); + + cp = njs_utf8_decode(&ctx, start, end); if (njs_fast_path(cp != '%')) { return expect_percent ? 0xFFFFFFFF: cp; } @@ -4378,18 +4414,19 @@ njs_int_t njs_string_decode_uri(njs_vm_t *vm, njs_value_t *args, njs_uint_t nargs, njs_index_t component) { - u_char *dst; - int64_t size, length; - uint32_t cp; - njs_int_t ret; - njs_chb_t chain; - njs_uint_t i, n; - njs_bool_t percent; - njs_value_t *value; - const u_char *src, *p, *end; - const uint32_t *reserve; - njs_string_prop_t string; - u_char encode[4]; + u_char *dst; + int64_t size, length; + uint32_t cp; + njs_int_t ret; + njs_chb_t chain; + njs_uint_t i, n; + njs_bool_t percent; + njs_value_t *value; + const u_char *src, *p, *end; + const uint32_t *reserve; + njs_string_prop_t string; + njs_unicode_decode_t ctx; + u_char encode[4]; static const uint32_t reserve_uri[] = { 0x00000000, /* 0000 0000 0000 0000 0000 0000 0000 0000 */ @@ -4472,6 +4509,8 @@ njs_string_decode_uri(njs_vm_t *vm, njs_value_t *args, njs_uint_t nargs, njs_chb_init(&chain, vm->mem_pool); + njs_utf8_decode_init(&ctx); + while (src < end) { percent = (src[0] == '%'); cp = njs_string_decode_uri_cp(hex, &src, end, 0); @@ -4529,8 +4568,8 @@ njs_string_decode_uri(njs_vm_t *vm, njs_value_t *args, njs_uint_t nargs, } p = encode; - cp = njs_utf8_decode(&p, p + n); - if (njs_slow_path(cp == 0xFFFFFFFF)) { + cp = njs_utf8_decode(&ctx, &p, p + n); + if (njs_slow_path(cp > NJS_UNICODE_MAX_CODEPOINT)) { goto uri_error; } diff --git a/src/njs_unicode.h b/src/njs_unicode.h index a2d32143..a45ce682 100644 --- a/src/njs_unicode.h +++ b/src/njs_unicode.h @@ -9,6 +9,7 @@ enum { + NJS_UNICODE_REPLACEMENT = 0xFFFD, NJS_UNICODE_MAX_CODEPOINT = 0x10FFFF, NJS_UNICODE_ERROR = 0x1FFFFF, NJS_UNICODE_CONTINUE = 0x2FFFFF @@ -16,6 +17,9 @@ enum { typedef struct { uint32_t codepoint; + + unsigned need; + u_char lower; u_char upper; } njs_unicode_decode_t; diff --git a/src/njs_utf8.c b/src/njs_utf8.c index 51baa490..de2cabd2 100644 --- a/src/njs_utf8.c +++ b/src/njs_utf8.c @@ -56,211 +56,166 @@ njs_utf8_encode(u_char *p, uint32_t u) } -/* - * njs_utf8_decode() decodes UTF-8 sequences and returns a valid - * character 0x00 - 0x10FFFF, or 0xFFFFFFFF for invalid or overlong - * UTF-8 sequence. - */ - -uint32_t -njs_utf8_decode(const u_char **start, const u_char *end) +njs_inline njs_int_t +njs_utf8_boundary(njs_unicode_decode_t *ctx, const u_char **data, + unsigned *need, u_char lower, u_char upper) { - uint32_t u; + u_char ch; - u = (uint32_t) **start; + ch = **data; - if (u < 0x80) { - (*start)++; - return u; + if (ch < lower || ch > upper) { + return NJS_ERROR; } - return njs_utf8_decode2(start, end); + (*data)++; + (*need)--; + ctx->codepoint = (ctx->codepoint << 6) | (ch & 0x3F); + + return NJS_OK; } -/* - * njs_utf8_decode2() decodes two and more bytes UTF-8 sequences only - * and returns a valid character 0x80 - 0x10FFFF, OR 0xFFFFFFFF for - * invalid or overlong UTF-8 sequence. - */ +njs_inline void +njs_utf8_boundary_set(njs_unicode_decode_t *ctx, const u_char ch, + u_char first, u_char second, u_char lower, u_char upper) +{ + if (ch == first) { + ctx->lower = lower; + ctx->upper = 0xBF; + + } else if (ch == second) { + ctx->lower = 0x80; + ctx->upper = upper; + } +} + uint32_t -njs_utf8_decode2(const u_char **start, const u_char *end) +njs_utf8_decode(njs_unicode_decode_t *ctx, const u_char **start, + const u_char *end) { u_char c; - size_t n; - uint32_t u, overlong; + unsigned need; + njs_int_t ret; const u_char *p; - p = *start; - u = (uint32_t) *p; - - if (u >= 0xE0) { + if (ctx->need != 0) { + need = ctx->need; + ctx->need = 0; - if (u >= 0xF0) { - - if (njs_slow_path(u > 0xF4)) { - /* - * The maximum valid Unicode character is 0x10FFFF - * which is encoded as 0xF4 0x8F 0xBF 0xBF. - */ - return 0xFFFFFFFF; + if (ctx->lower != 0x00) { + ret = njs_utf8_boundary(ctx, start, &need, ctx->lower, ctx->upper); + if (njs_slow_path(ret != NJS_OK)) { + goto failed; } - u &= 0x07; - overlong = 0x00FFFF; - n = 3; - - } else { - u &= 0x0F; - overlong = 0x07FF; - n = 2; + ctx->lower = 0x00; } - } else if (u >= 0xC2) { - - /* 0x80 is encoded as 0xC2 0x80. */ - - u &= 0x1F; - overlong = 0x007F; - n = 1; - - } else { - /* u <= 0xC2 */ - return 0xFFFFFFFF; + goto decode; } - p++; - - if (njs_fast_path(p + n <= end)) { - - do { - c = *p++; - /* - * The byte must in the 0x80 - 0xBF range. - * Values below 0x80 become >= 0x80. - */ - c = c - 0x80; - - if (njs_slow_path(c > 0x3F)) { - return 0xFFFFFFFF; - } + c = *(*start)++; - u = (u << 6) | c; - n--; + if (c < 0x80) { + return c; - } while (n != 0); - - if (overlong < u && u < 0x110000) { - *start = p; - return u; + } else if (c <= 0xDF) { + if (c < 0xC2) { + return NJS_UNICODE_ERROR; } - } - - return 0xFFFFFFFF; -} + need = 1; + ctx->codepoint = c & 0x1F; -uint32_t -njs_utf8_safe_decode(const u_char **start, const u_char *end) -{ - uint32_t u; + } else if (c < 0xF0) { + need = 2; + ctx->codepoint = c & 0x0F; - u = (uint32_t) **start; + if (*start == end) { + njs_utf8_boundary_set(ctx, c, 0xE0, 0xED, 0xA0, 0x9F); + goto next; + } - if (u < 0x80) { - (*start)++; - return u; - } + ret = NJS_OK; - return njs_utf8_safe_decode2(start, end); -} + if (c == 0xE0) { + ret = njs_utf8_boundary(ctx, start, &need, 0xA0, 0xBF); + } else if (c == 0xED) { + ret = njs_utf8_boundary(ctx, start, &need, 0x80, 0x9F); + } -uint32_t -njs_utf8_safe_decode2(const u_char **start, const u_char *end) -{ - u_char c; - size_t n; - uint32_t u, overlong; - const u_char *p; - - p = *start; - u = (uint32_t) *p; + if (njs_slow_path(ret != NJS_OK)) { + goto failed; + } - if (u >= 0xE0) { + } else if (c < 0xF5) { + need = 3; + ctx->codepoint = c & 0x07; - if (u >= 0xF0) { + if (*start == end) { + njs_utf8_boundary_set(ctx, c, 0xF0, 0xF4, 0x90, 0x8F); + goto next; + } - if (njs_slow_path(u > 0xF4)) { - /* - * The maximum valid Unicode character is 0x10FFFF - * which is encoded as 0xF4 0x8F 0xBF 0xBF. - */ - goto fail_one; - } + ret = NJS_OK; - u &= 0x07; - overlong = 0x00FFFF; - n = 3; + if (c == 0xF0) { + ret = njs_utf8_boundary(ctx, start, &need, 0x90, 0xBF); - } else { - u &= 0x0F; - overlong = 0x07FF; - n = 2; + } else if (c == 0xF4) { + ret = njs_utf8_boundary(ctx, start, &need, 0x80, 0x8F); } - } else if (u >= 0xC2) { - - /* 0x80 is encoded as 0xC2 0x80. */ - - u &= 0x1F; - overlong = 0x007F; - n = 1; + if (njs_slow_path(ret != NJS_OK)) { + goto failed; + } } else { - /* u <= 0xC2 */ - goto fail_one; + return NJS_UNICODE_ERROR; } - p++; +decode: + + for (p = *start; p < end; p++) { + c = *p; - while (p < end && n != 0) { - c = *p++; - /* - * The byte must in the 0x80 - 0xBF range. - * Values below 0x80 become >= 0x80. - */ - c = c - 0x80; + if (c < 0x80 || c > 0xBF) { + *start = p; - if (njs_slow_path(c > 0x3F)) { - *start = --p; - return NJS_UTF8_REPLACEMENT; + goto failed; } - u = (u << 6) | c; - n--; + ctx->codepoint = (ctx->codepoint << 6) | (c & 0x3F); + + if (--need == 0) { + *start = p + 1; + + return ctx->codepoint; + } } *start = p; - if (n == 0 && overlong < u && u < 0x110000) { - return u; - } +next: - return NJS_UTF8_REPLACEMENT; + ctx->need = need; -fail_one: + return NJS_UNICODE_CONTINUE; - (*start)++; +failed: - return NJS_UTF8_REPLACEMENT; -} + ctx->lower = 0x00; + ctx->need = 0; + return NJS_UNICODE_ERROR; +} /* * njs_utf8_casecmp() tests only up to the minimum of given lengths, but - * requires lengths of both strings because otherwise njs_utf8_decode2() + * requires lengths of both strings because otherwise njs_utf8_decode() * may fail due to incomplete sequence. */ @@ -282,7 +237,7 @@ njs_utf8_casecmp(const u_char *start1, const u_char *start2, size_t len1, u2 = njs_utf8_lower_case(&start2, end2); if (njs_slow_path((u1 | u2) == 0xFFFFFFFF)) { - return NJS_UTF8_SORT_INVALID; + return NJS_UNICODE_ERROR; } n = u1 - u2; @@ -299,8 +254,9 @@ njs_utf8_casecmp(const u_char *start1, const u_char *start2, size_t len1, uint32_t njs_utf8_lower_case(const u_char **start, const u_char *end) { - uint32_t u; - const uint32_t *block; + uint32_t u; + const uint32_t *block; + njs_unicode_decode_t ctx; u = (uint32_t) **start; @@ -310,7 +266,9 @@ njs_utf8_lower_case(const u_char **start, const u_char *end) return njs_unicode_lower_case_block_000[u]; } - u = njs_utf8_decode2(start, end); + njs_utf8_decode_init(&ctx); + + u = njs_utf8_decode(&ctx, start, end); if (u <= NJS_UNICODE_MAX_LOWER_CASE) { block = njs_unicode_lower_case_blocks[u / NJS_UNICODE_BLOCK_SIZE]; @@ -327,8 +285,9 @@ njs_utf8_lower_case(const u_char **start, const u_char *end) uint32_t njs_utf8_upper_case(const u_char **start, const u_char *end) { - uint32_t u; - const uint32_t *block; + uint32_t u; + const uint32_t *block; + njs_unicode_decode_t ctx; u = (uint32_t) **start; @@ -338,7 +297,9 @@ njs_utf8_upper_case(const u_char **start, const u_char *end) return njs_unicode_upper_case_block_000[u]; } - u = njs_utf8_decode2(start, end); + njs_utf8_decode_init(&ctx); + + u = njs_utf8_decode(&ctx, start, end); if (u <= NJS_UNICODE_MAX_UPPER_CASE) { block = njs_unicode_upper_case_blocks[u / NJS_UNICODE_BLOCK_SIZE]; @@ -355,15 +316,20 @@ njs_utf8_upper_case(const u_char **start, const u_char *end) ssize_t njs_utf8_length(const u_char *p, size_t len) { - ssize_t length; - const u_char *end; + ssize_t length; + const u_char *end; + njs_unicode_decode_t ctx; length = 0; end = p + len; + njs_utf8_decode_init(&ctx); + while (p < end) { - if (njs_slow_path(njs_utf8_decode(&p, end) == 0xffffffff)) { + if (njs_slow_path(njs_utf8_decode(&ctx, &p, end) + > NJS_UNICODE_MAX_CODEPOINT)) + { return -1; } @@ -377,19 +343,27 @@ njs_utf8_length(const u_char *p, size_t len) ssize_t njs_utf8_safe_length(const u_char *p, size_t len, ssize_t *out_size) { - ssize_t size, length; - uint32_t codepoint; - const u_char *end; + ssize_t size, length; + uint32_t codepoint; + const u_char *end; + njs_unicode_decode_t ctx; size = 0; length = 0; end = p + len; + njs_utf8_decode_init(&ctx); + while (p < end) { - codepoint = njs_utf8_safe_decode(&p, end); + codepoint = njs_utf8_decode(&ctx, &p, end); - size += njs_utf8_size(codepoint); + if (codepoint <= NJS_UNICODE_MAX_CODEPOINT) { + size += njs_utf8_size(codepoint); + + } else { + size += njs_utf8_size(NJS_UNICODE_REPLACEMENT); + } length++; } @@ -405,12 +379,17 @@ njs_utf8_safe_length(const u_char *p, size_t len, ssize_t *out_size) njs_bool_t njs_utf8_is_valid(const u_char *p, size_t len) { - const u_char *end; + const u_char *end; + njs_unicode_decode_t ctx; end = p + len; + njs_utf8_decode_init(&ctx); + while (p < end) { - if (njs_slow_path(njs_utf8_decode(&p, end) == 0xffffffff)) { + if (njs_slow_path(njs_utf8_decode(&ctx, &p, end) + > NJS_UNICODE_MAX_CODEPOINT)) + { return 0; } } diff --git a/src/njs_utf8.h b/src/njs_utf8.h index 303a0b92..f9518cad 100644 --- a/src/njs_utf8.h +++ b/src/njs_utf8.h @@ -8,23 +8,9 @@ #define _NJS_UTF8_H_INCLUDED_ -/* - * Since the maximum valid Unicode character is 0x0010FFFF, the maximum - * difference between Unicode characters is lesser 0x0010FFFF and - * 0x0EEE0EEE can be used as value to indicate UTF-8 encoding error. - */ -#define NJS_UTF8_SORT_INVALID 0x0EEE0EEE - -#define NJS_UTF8_REPLACEMENT 0xFFFD - - NJS_EXPORT u_char *njs_utf8_encode(u_char *p, uint32_t u); -NJS_EXPORT uint32_t njs_utf8_decode(const u_char **start, const u_char *end); -NJS_EXPORT uint32_t njs_utf8_decode2(const u_char **start, const u_char *end); -NJS_EXPORT uint32_t njs_utf8_safe_decode(const u_char **start, - const u_char *end); -NJS_EXPORT uint32_t njs_utf8_safe_decode2(const u_char **start, - const u_char *end); +NJS_EXPORT uint32_t njs_utf8_decode(njs_unicode_decode_t *ctx, + const u_char **data, const u_char *end); NJS_EXPORT njs_int_t njs_utf8_casecmp(const u_char *start1, const u_char *start2, size_t len1, size_t len2); NJS_EXPORT uint32_t njs_utf8_lower_case(const u_char **start, @@ -36,7 +22,6 @@ NJS_EXPORT ssize_t njs_utf8_safe_length(const u_char *p, size_t len, ssize_t *out_size); NJS_EXPORT njs_bool_t njs_utf8_is_valid(const u_char *p, size_t len); - /* * njs_utf8_next() and njs_utf8_prev() expect a valid UTF-8 string. * @@ -114,12 +99,25 @@ njs_utf8_copy(u_char *dst, const u_char **src, const u_char *end) } -#define njs_utf8_size(u) \ - ((u < 0x80) ? 1 : ((u < 0x0800) ? 2 : ((u < 0x10000) ? 3 : 4))) +njs_inline void +njs_utf8_decode_init(njs_unicode_decode_t *ctx) +{ + ctx->need = 0x00; +} + + +njs_inline size_t +njs_utf8_size(uint32_t cp) +{ + return (cp < 0x80) ? 1 : ((cp < 0x0800) ? 2 : ((cp < 0x10000) ? 3 : 4)); +} -#define njs_utf8_size_uint16(u) \ - ((u < 0x80) ? 1 : ((u < 0x0800) ? 2 : 3)) +njs_inline size_t +njs_utf8_size_uint16(uint32_t cp) +{ + return ((cp < 0x80) ? 1 : ((cp < 0x0800) ? 2 : 3)); +} njs_inline njs_bool_t diff --git a/src/test/njs_unit_test.c b/src/test/njs_unit_test.c index 30464981..df944390 100644 --- a/src/test/njs_unit_test.c +++ b/src/test/njs_unit_test.c @@ -7140,9 +7140,36 @@ static njs_unit_test_t njs_test[] = { njs_str("String.fromCharCode(65.14 + 65536)"), njs_str("A") }, + { njs_str("String.fromCharCode(0xD83D, 0xDCA9)"), + njs_str("💩") }, + + { njs_str("String.fromCharCode(0xD83D, 0xDCA9).length"), + njs_str("1") }, + + { njs_str("String.fromCharCode(0xD83D)"), + njs_str("�") }, + + { njs_str("String.fromCharCode(0xD83D).length"), + njs_str("1") }, + + { njs_str("String.fromCharCode(0xD83D) + String.fromCharCode(0xDCA9)"), + njs_str("��") }, + { njs_str("String.fromCodePoint(65 + 65536)"), njs_str("𐁁") }, + { njs_str("String.fromCodePoint(0xD83D, 0xDCA9)"), + njs_str("💩") }, + + { njs_str("String.fromCodePoint(0xD83D, 0xDCA9).length"), + njs_str("1") }, + + { njs_str("String.fromCodePoint(0xD83D)"), + njs_str("�") }, + + { njs_str("String.fromCodePoint(0xD83D).length"), + njs_str("1") }, + { njs_str("String.fromCharCode(2**53 + 10)"), njs_str("\n") }, @@ -7158,7 +7185,10 @@ static njs_unit_test_t njs_test[] = { njs_str("(function() {" " var n;" " for (n = 0; n <= 65536; n++) {" - " if (String.fromCharCode(n).charCodeAt(0) !== n)" + + /* From U+D800 to U+DFFF is surrogate pair. Not valid in UTF-8. */ + + " if ((n < 0xD800 || n > 0xDFFF) && String.fromCharCode(n).charCodeAt(0) !== n)" " return n;" " }" " return -1" @@ -7169,7 +7199,7 @@ static njs_unit_test_t njs_test[] = { njs_str("(function() {" " var n;" " for (n = 0; n <= 1114111; n++) {" - " if (String.fromCodePoint(n).codePointAt(0) !== n)" + " if ((n < 0xD800 || n > 0xDFFF) && String.fromCodePoint(n).codePointAt(0) !== n)" " return n;" " }" " return -1" @@ -8274,16 +8304,8 @@ static njs_unit_test_t njs_test[] = { njs_str("encodeURI('012абв')"), njs_str("012%D0%B0%D0%B1%D0%B2")}, - { njs_str("[" - " String.fromCharCode(0xD800)," - " String.fromCharCode(0xD800) + 'a'," - " String.fromCharCode(0xDC00)," - " String.fromCharCode(0xDC00) + 'a'," - "].every(v=>{try { encodeURI(v)} catch(e) {return e.name == 'URIError'}})"), - njs_str("true")}, - { njs_str("encodeURI(String.fromCharCode(0xD800)+String.fromCharCode(0xDC00))"), - njs_str("%F0%90%80%80")}, + njs_str("%EF%BF%BD%EF%BF%BD")}, { njs_str("encodeURI('~}|{`_^]\\\\[@?>=<;:/.-,+*)(\\\'&%$#\"! ')"), njs_str("~%7D%7C%7B%60_%5E%5D%5C%5B@?%3E=%3C;:/.-,+*)('&%25$#%22!%20")}, diff --git a/src/test/unicode_unit_test.c b/src/test/unicode_unit_test.c index 1331f69b..35416ea1 100644 --- a/src/test/unicode_unit_test.c +++ b/src/test/unicode_unit_test.c @@ -41,15 +41,18 @@ static u_char invalid[] = { static njs_int_t utf8_overlong(u_char *overlong, size_t len) { - u_char *p, utf8[4]; - size_t size; - uint32_t u, d; - njs_uint_t i; - const u_char *pp; + u_char *p, utf8[4]; + size_t size; + uint32_t u, d; + njs_uint_t i; + const u_char *pp; + njs_unicode_decode_t ctx; + + njs_utf8_decode_init(&ctx); pp = overlong; - d = njs_utf8_decode(&pp, overlong + len); + d = njs_utf8_decode(&ctx, &pp, overlong + len); len = pp - overlong; @@ -79,18 +82,19 @@ utf8_overlong(u_char *overlong, size_t len) static njs_int_t utf8_unit_test(njs_uint_t start) { - u_char *p, utf8[4]; - size_t len; - int32_t n; - uint32_t u, d; - njs_uint_t i, k, l, m; - const u_char *pp; + u_char *p, utf8[4]; + size_t len; + int32_t n; + uint32_t u, d; + njs_uint_t i, k, l, m; + const u_char *pp; + njs_unicode_decode_t ctx; njs_printf("utf8 test started\n"); /* Test valid UTF-8. */ - for (u = 0; u < 0x110000; u++) { + for (u = 0; u <= NJS_UNICODE_MAX_CODEPOINT; u++) { p = njs_utf8_encode(utf8, u); @@ -101,7 +105,22 @@ utf8_unit_test(njs_uint_t start) pp = utf8; - d = njs_utf8_decode(&pp, p); + njs_utf8_decode_init(&ctx); + + d = njs_utf8_decode(&ctx, &pp, p); + + /* In UTF-8 not allowed UTF-16 surrogate pair sequences. */ + + if (u >= 0xD800 && u <= 0xDFFF) { + if (d != NJS_UNICODE_ERROR) { + njs_printf("njs_utf8_decode(%05uXD) failed for " + "surrogate pair: %05uxD\n", u, d); + + return NJS_ERROR; + } + + continue; + } if (u != d) { njs_printf("njs_utf8_decode(%05uXD) failed: %05uxD\n", u, d); @@ -121,9 +140,11 @@ utf8_unit_test(njs_uint_t start) pp = utf8; - d = njs_utf8_decode(&pp, utf8 + len); + njs_utf8_decode_init(&ctx); + + d = njs_utf8_decode(&ctx, &pp, utf8 + len); - if (d != 0xFFFFFFFF) { + if (d <= NJS_UNICODE_MAX_CODEPOINT) { u = 0; for (i = 0; i < len; i++) {