if (njs_surrogate_any(utf)) {
if (utf > 0xdbff || p[0] != '\\' || p[1] != 'u') {
- s = njs_utf8_encode(s, NJS_UTF8_REPLACEMENT);
+ s = njs_utf8_encode(s, NJS_UNICODE_REPLACEMENT);
continue;
}
utf = njs_string_surrogate_pair(utf, utf_low);
} else if (njs_surrogate_leading(utf_low)) {
- utf = NJS_UTF8_REPLACEMENT;
- s = njs_utf8_encode(s, NJS_UTF8_REPLACEMENT);
+ utf = NJS_UNICODE_REPLACEMENT;
+ s = njs_utf8_encode(s, NJS_UNICODE_REPLACEMENT);
} else {
utf = utf_low;
- s = njs_utf8_encode(s, NJS_UTF8_REPLACEMENT);
+ s = njs_utf8_encode(s, NJS_UNICODE_REPLACEMENT);
}
}
njs_parser_string_create(njs_vm_t *vm, njs_lexer_token_t *token,
njs_value_t *value)
{
- u_char *dst;
- ssize_t size, length;
- uint32_t cp;
- njs_str_t *src;
- const u_char *p, *end;
+ u_char *dst;
+ ssize_t size, length;
+ uint32_t cp;
+ njs_str_t *src;
+ const u_char *p, *end;
+ njs_unicode_decode_t ctx;
src = &token->text;
p = src->start;
end = src->start + src->length;
+ njs_utf8_decode_init(&ctx);
+
while (p < end) {
- cp = njs_utf8_safe_decode(&p, end);
+ cp = njs_utf8_decode(&ctx, &p, end);
- dst = njs_utf8_encode(dst, cp);
+ if (cp <= NJS_UNICODE_MAX_CODEPOINT) {
+ dst = njs_utf8_encode(dst, cp);
+
+ } else {
+ dst = njs_utf8_encode(dst, NJS_UNICODE_REPLACEMENT);
+ }
}
if (length > NJS_STRING_MAP_STRIDE && size != length) {
njs_parser_escape_string_create(njs_parser_t *parser, njs_lexer_token_t *token,
njs_value_t *value)
{
- u_char c, *start, *dst;
- size_t size, length, hex_length;
- uint64_t cp, cp_pair;
- njs_int_t ret;
- njs_str_t *string;
- const u_char *src, *end, *hex_end;
+ u_char c, *start, *dst;
+ size_t size, length, hex_length;
+ uint64_t cp, cp_pair;
+ njs_int_t ret;
+ njs_str_t *string;
+ const u_char *src, *end, *hex_end;
+ njs_unicode_decode_t ctx;
ret = njs_parser_escape_string_calc_length(parser, token, &size, &length);
if (njs_slow_path(ret != NJS_OK)) {
src--;
- cp = njs_utf8_safe_decode2(&src, end);
+ njs_utf8_decode_init(&ctx);
+
+ cp = njs_utf8_decode(&ctx, &src, end);
+ if (cp > NJS_UNICODE_MAX_CODEPOINT) {
+ cp = NJS_UNICODE_REPLACEMENT;
+ }
+
dst = njs_utf8_encode(dst, cp);
continue;
cp = njs_string_surrogate_pair(cp_pair, cp);
} else if (njs_slow_path(njs_surrogate_leading(cp))) {
- cp = NJS_UTF8_REPLACEMENT;
+ cp = NJS_UNICODE_REPLACEMENT;
dst = njs_utf8_encode(dst, (uint32_t) cp);
} else {
- dst = njs_utf8_encode(dst, NJS_UTF8_REPLACEMENT);
+ dst = njs_utf8_encode(dst, NJS_UNICODE_REPLACEMENT);
}
cp_pair = 0;
continue;
}
- cp = NJS_UTF8_REPLACEMENT;
+ cp = NJS_UNICODE_REPLACEMENT;
}
dst = njs_utf8_encode(dst, (uint32_t) cp);
njs_parser_escape_string_calc_length(njs_parser_t *parser,
njs_lexer_token_t *token, size_t *out_size, size_t *out_length)
{
- size_t size, length, hex_length;
- uint64_t cp, cp_pair;
- njs_str_t *string;
- const u_char *ptr, *src, *end, *hex_end;
+ size_t size, length, hex_length;
+ uint64_t cp, cp_pair;
+ njs_str_t *string;
+ const u_char *ptr, *src, *end, *hex_end;
+ njs_unicode_decode_t ctx;
size = 0;
length = 0;
}
if (*src >= 0x80) {
- cp = njs_utf8_safe_decode2(&src, end);
+ njs_utf8_decode_init(&ctx);
+
+ cp = njs_utf8_decode(&ctx, &src, end);
+ if (cp > NJS_UNICODE_MAX_CODEPOINT) {
+ cp = NJS_UNICODE_REPLACEMENT;
+ }
size += njs_utf8_size(cp);
length++;
cp = njs_string_surrogate_pair(cp_pair, cp);
} else if (njs_slow_path(njs_surrogate_leading(cp))) {
- cp = NJS_UTF8_REPLACEMENT;
+ cp = NJS_UNICODE_REPLACEMENT;
size += njs_utf8_size(cp);
length++;
} else {
- size += njs_utf8_size(NJS_UTF8_REPLACEMENT);
+ size += njs_utf8_size(NJS_UNICODE_REPLACEMENT);
length++;
}
continue;
}
- cp = NJS_UTF8_REPLACEMENT;
+ cp = NJS_UNICODE_REPLACEMENT;
}
size += njs_utf8_size(cp);
njs_slice_prop_t *slice, njs_value_t *args, njs_uint_t nargs);
static njs_int_t njs_string_slice_args(njs_vm_t *vm, njs_slice_prop_t *slice,
njs_value_t *args, njs_uint_t nargs);
-static njs_int_t njs_string_from_char_code(njs_vm_t *vm,
- njs_value_t *args, njs_uint_t nargs, njs_index_t unused);
-static njs_int_t njs_string_from_code_point(njs_vm_t *vm, njs_value_t *args,
- njs_uint_t nargs, njs_index_t unused);
+static njs_int_t njs_string_from_char_code(njs_vm_t *vm, njs_value_t *args,
+ njs_uint_t nargs, njs_index_t is_point);
static njs_int_t njs_string_bytes_from(njs_vm_t *vm, njs_value_t *args,
njs_uint_t nargs, njs_index_t unused);
static njs_int_t njs_string_bytes_from_array_like(njs_vm_t *vm,
{
.type = NJS_PROPERTY,
.name = njs_string("fromCharCode"),
- .value = njs_native_function(njs_string_from_char_code, 1),
+ .value = njs_native_function2(njs_string_from_char_code, 1, 0),
.writable = 1,
.configurable = 1,
},
{
.type = NJS_PROPERTY,
.name = njs_string("fromCodePoint"),
- .value = njs_native_function(njs_string_from_code_point, 1),
+ .value = njs_native_function2(njs_string_from_char_code, 1, 1),
.writable = 1,
.configurable = 1,
},
njs_string_prototype_to_bytes(njs_vm_t *vm, njs_value_t *args, njs_uint_t nargs,
njs_index_t unused)
{
- u_char *p;
- size_t length;
- uint32_t byte;
- njs_int_t ret;
- const u_char *s, *end;
- njs_slice_prop_t slice;
- njs_string_prop_t string;
+ u_char *p;
+ size_t length;
+ uint32_t byte;
+ njs_int_t ret;
+ const u_char *s, *end;
+ njs_slice_prop_t slice;
+ njs_string_prop_t string;
+ njs_unicode_decode_t ctx;
ret = njs_string_object_validate(vm, njs_arg(args, nargs, 0));
if (njs_slow_path(ret != NJS_OK)) {
length = slice.length;
+ njs_utf8_decode_init(&ctx);
+
while (length != 0 && s < end) {
- byte = njs_utf8_decode(&s, end);
+ byte = njs_utf8_decode(&ctx, &s, end);
if (njs_slow_path(byte > 0xFF)) {
njs_release(vm, &vm->retval);
njs_string_prototype_char_code_at(njs_vm_t *vm, njs_value_t *args,
njs_uint_t nargs, njs_index_t unused)
{
- double num;
- size_t length;
- int64_t index;
- uint32_t code;
- njs_int_t ret;
- const u_char *start, *end;
- njs_string_prop_t string;
+ double num;
+ size_t length;
+ int64_t index;
+ uint32_t code;
+ njs_int_t ret;
+ const u_char *start, *end;
+ njs_string_prop_t string;
+ njs_unicode_decode_t ctx;
ret = njs_string_object_validate(vm, njs_arg(args, nargs, 0));
if (njs_slow_path(ret != NJS_OK)) {
code = string.start[index];
} else {
+ njs_utf8_decode_init(&ctx);
+
/* UTF-8 string. */
end = string.start + string.size;
start = njs_string_offset(string.start, end, index);
- code = njs_utf8_decode(&start, end);
+ code = njs_utf8_decode(&ctx, &start, end);
}
num = code;
static njs_int_t
-njs_string_from_char_code(njs_vm_t *vm, njs_value_t *args,
- njs_uint_t nargs, njs_index_t unused)
+njs_string_from_char_code(njs_vm_t *vm, njs_value_t *args, njs_uint_t nargs,
+ njs_index_t is_point)
{
- u_char *p;
- size_t size;
- uint16_t code;
- njs_int_t ret;
- njs_uint_t i;
+ double num;
+ u_char *p, *start, *end;
+ ssize_t len;
+ int32_t code;
+ uint32_t cp;
+ uint64_t length, size;
+ njs_int_t ret;
+ njs_uint_t i;
+ njs_unicode_decode_t ctx;
+ u_char buf[4];
+
+ size = 0;
+ length = 0;
+
+ cp = 0x00;
+ end = buf + sizeof(buf);
+
+ njs_utf16_decode_init(&ctx);
for (i = 1; i < nargs; i++) {
if (!njs_is_numeric(&args[i])) {
return ret;
}
}
- }
- size = 0;
+ if (is_point) {
+ num = njs_number(&args[i]);
+ if (isnan(num)) {
+ goto range_error;
+ }
- for (i = 1; i < nargs; i++) {
- code = njs_number_to_uint16(njs_number(&args[i]));
- size += njs_utf8_size_uint16(code);
- }
+ code = num;
- p = njs_string_alloc(vm, &vm->retval, size, nargs - 1);
- if (njs_slow_path(p == NULL)) {
- return NJS_ERROR;
- }
-
- for (i = 1; i < nargs; i++) {
- code = njs_number_to_uint16(njs_number(&args[i]));
- p = njs_utf8_encode(p, code);
- }
+ if (code != num || code < 0 || code > 0x10FFFF) {
+ goto range_error;
+ }
- return NJS_OK;
-}
+ } else {
+ code = njs_number_to_uint16(njs_number(&args[i]));
+ }
+ start = buf;
+ len = njs_utf16_encode(code, &start, end);
-static njs_int_t
-njs_string_from_code_point(njs_vm_t *vm, njs_value_t *args, njs_uint_t nargs,
- njs_index_t unused)
-{
- u_char *p;
- double num;
- size_t size;
- int32_t code;
- njs_int_t ret;
- njs_uint_t i;
+ start = buf;
+ cp = njs_utf16_decode(&ctx, (const u_char **) &start, start + len);
- for (i = 1; i < nargs; i++) {
- if (!njs_is_numeric(&args[i])) {
- ret = njs_value_to_numeric(vm, &args[i], &args[i]);
- if (ret != NJS_OK) {
- return ret;
+ if (cp > NJS_UNICODE_MAX_CODEPOINT) {
+ if (cp == NJS_UNICODE_CONTINUE) {
+ continue;
}
- }
- }
-
- size = 0;
- for (i = 1; i < nargs; i++) {
- num = njs_number(&args[i]);
- if (isnan(num)) {
- goto range_error;
+ cp = NJS_UNICODE_REPLACEMENT;
}
- code = num;
-
- if (code != num || code < 0 || code >= 0x110000) {
- goto range_error;
- }
+ size += njs_utf8_size(cp);
+ length++;
+ }
- size += njs_utf8_size(code);
+ if (cp == NJS_UNICODE_CONTINUE) {
+ size += njs_utf8_size(NJS_UNICODE_REPLACEMENT);
+ length++;
}
- p = njs_string_alloc(vm, &vm->retval, size, nargs - 1);
+ p = njs_string_alloc(vm, &vm->retval, size, length);
if (njs_slow_path(p == NULL)) {
return NJS_ERROR;
}
+ njs_utf16_decode_init(&ctx);
+
for (i = 1; i < nargs; i++) {
- p = njs_utf8_encode(p, njs_number(&args[i]));
+ if (is_point) {
+ code = njs_number(&args[i]);
+
+ } else {
+ code = njs_number_to_uint16(njs_number(&args[i]));
+ }
+
+ start = buf;
+ len = njs_utf16_encode(code, &start, end);
+
+ start = buf;
+ cp = njs_utf16_decode(&ctx, (const u_char **) &start, start + len);
+
+ if (cp > NJS_UNICODE_MAX_CODEPOINT) {
+ if (cp == NJS_UNICODE_CONTINUE && i + 1 != nargs) {
+ continue;
+ }
+
+ cp = NJS_UNICODE_REPLACEMENT;
+ }
+
+ p = njs_utf8_encode(p, cp);
}
return NJS_OK;
njs_string_prototype_trim(njs_vm_t *vm, njs_value_t *args, njs_uint_t nargs,
njs_index_t mode)
{
- uint32_t u, trim, length;
- njs_int_t ret;
- njs_value_t *value;
- const u_char *p, *prev, *start, *end;
- njs_string_prop_t string;
+ uint32_t u, trim, length;
+ njs_int_t ret;
+ njs_value_t *value;
+ const u_char *p, *prev, *start, *end;
+ njs_string_prop_t string;
+ njs_unicode_decode_t ctx;
value = njs_argument(args, 0);
ret = njs_string_object_validate(vm, value);
/* UTF-8 string. */
if (mode & NJS_TRIM_START) {
+ njs_utf8_decode_init(&ctx);
+
for ( ;; ) {
if (start == end) {
goto empty;
}
p = start;
- u = njs_utf8_decode(&start, end);
+ u = njs_utf8_decode(&ctx, &start, end);
if (njs_utf8_is_whitespace(u)) {
trim++;
if (mode & NJS_TRIM_END) {
prev = end;
+ njs_utf8_decode_init(&ctx);
+
for ( ;; ) {
if (start == prev) {
goto empty;
prev = njs_utf8_prev(prev);
p = prev;
- u = njs_utf8_decode(&p, end);
+ u = njs_utf8_decode(&ctx, &p, end);
if (njs_utf8_is_whitespace(u)) {
trim++;
double
njs_string_to_number(const njs_value_t *value, njs_bool_t parse_float)
{
- double num;
- size_t size;
- uint32_t u;
- njs_bool_t minus;
- const u_char *p, *start, *end;
+ double num;
+ size_t size;
+ uint32_t u;
+ njs_bool_t minus;
+ const u_char *p, *start, *end;
+ njs_unicode_decode_t ctx;
const size_t infinity = njs_length("Infinity");
end = p + size;
+ njs_utf8_decode_init(&ctx);
+
while (p < end) {
start = p;
- u = njs_utf8_decode(&p, end);
+ u = njs_utf8_decode(&ctx, &p, end);
if (!njs_utf8_is_whitespace(u)) {
p = start;
njs_string_encode_uri(njs_vm_t *vm, njs_value_t *args, njs_uint_t nargs,
njs_index_t component)
{
- u_char byte, *dst;
- uint64_t size;
- uint32_t cp, cp_low;
- njs_int_t ret;
- njs_value_t *value;
- const u_char *src, *end;
- const uint32_t *escape;
- njs_string_prop_t string;
- u_char encode[4];
+ u_char byte, *dst;
+ uint64_t size;
+ uint32_t cp, cp_low;
+ njs_int_t ret;
+ njs_value_t *value;
+ const u_char *src, *end;
+ const uint32_t *escape;
+ njs_string_prop_t string;
+ njs_unicode_decode_t ctx;
+ u_char encode[4];
static const uint32_t escape_uri[] = {
0xffffffff, /* 1111 1111 1111 1111 1111 1111 1111 1111 */
} else {
/* UTF-8 string. */
+ njs_utf8_decode_init(&ctx);
+
while (src < end) {
- cp = njs_utf8_decode(&src, end);
+ cp = njs_utf8_decode(&ctx, &src, end);
if (cp < 0x80 && !njs_need_escape(escape, cp)) {
size++;
}
if (njs_surrogate_leading(cp)) {
- cp_low = njs_utf8_decode(&src, end);
+ cp_low = njs_utf8_decode(&ctx, &src, end);
if (njs_slow_path(!njs_surrogate_trailing(cp_low))) {
goto uri_error;
/* UTF-8 string. */
+ njs_utf8_decode_init(&ctx);
+
while (src < end) {
- cp = njs_utf8_decode(&src, end);
+ cp = njs_utf8_decode(&ctx, &src, end);
if (njs_slow_path(njs_surrogate_leading(cp))) {
- cp_low = njs_utf8_decode(&src, end);
+ cp_low = njs_utf8_decode(&ctx, &src, end);
cp = njs_string_surrogate_pair(cp, cp_low);
}
njs_string_decode_uri_cp(const int8_t *hex, const u_char **start,
const u_char *end, njs_bool_t expect_percent)
{
- int8_t d0, d1;
- uint32_t cp;
- const u_char *p;
+ int8_t d0, d1;
+ uint32_t cp;
+ const u_char *p;
+ njs_unicode_decode_t ctx;
- cp = njs_utf8_decode(start, end);
+ njs_utf8_decode_init(&ctx);
+
+ cp = njs_utf8_decode(&ctx, start, end);
if (njs_fast_path(cp != '%')) {
return expect_percent ? 0xFFFFFFFF: cp;
}
njs_string_decode_uri(njs_vm_t *vm, njs_value_t *args, njs_uint_t nargs,
njs_index_t component)
{
- u_char *dst;
- int64_t size, length;
- uint32_t cp;
- njs_int_t ret;
- njs_chb_t chain;
- njs_uint_t i, n;
- njs_bool_t percent;
- njs_value_t *value;
- const u_char *src, *p, *end;
- const uint32_t *reserve;
- njs_string_prop_t string;
- u_char encode[4];
+ u_char *dst;
+ int64_t size, length;
+ uint32_t cp;
+ njs_int_t ret;
+ njs_chb_t chain;
+ njs_uint_t i, n;
+ njs_bool_t percent;
+ njs_value_t *value;
+ const u_char *src, *p, *end;
+ const uint32_t *reserve;
+ njs_string_prop_t string;
+ njs_unicode_decode_t ctx;
+ u_char encode[4];
static const uint32_t reserve_uri[] = {
0x00000000, /* 0000 0000 0000 0000 0000 0000 0000 0000 */
njs_chb_init(&chain, vm->mem_pool);
+ njs_utf8_decode_init(&ctx);
+
while (src < end) {
percent = (src[0] == '%');
cp = njs_string_decode_uri_cp(hex, &src, end, 0);
}
p = encode;
- cp = njs_utf8_decode(&p, p + n);
- if (njs_slow_path(cp == 0xFFFFFFFF)) {
+ cp = njs_utf8_decode(&ctx, &p, p + n);
+ if (njs_slow_path(cp > NJS_UNICODE_MAX_CODEPOINT)) {
goto uri_error;
}
enum {
+ NJS_UNICODE_REPLACEMENT = 0xFFFD,
NJS_UNICODE_MAX_CODEPOINT = 0x10FFFF,
NJS_UNICODE_ERROR = 0x1FFFFF,
NJS_UNICODE_CONTINUE = 0x2FFFFF
typedef struct {
uint32_t codepoint;
+ /* State carried between njs_utf8_decode() calls on split input. */
+ unsigned need; /* Continuation bytes still expected; 0 when idle. */
+ u_char lower; /* Lower bound for the next byte; 0x00 means no check. */
u_char upper;
} njs_unicode_decode_t;
}
-/*
- * njs_utf8_decode() decodes UTF-8 sequences and returns a valid
- * character 0x00 - 0x10FFFF, or 0xFFFFFFFF for invalid or overlong
- * UTF-8 sequence.
- */
-
-uint32_t
-njs_utf8_decode(const u_char **start, const u_char *end)
+/*
+ * njs_utf8_boundary() checks that the byte at *data lies in the
+ * [lower, upper] range.  On success the byte is consumed, *need is
+ * decremented, the low 6 bits of the byte are appended to ctx->codepoint
+ * and NJS_OK is returned.  Otherwise NJS_ERROR is returned and *data is
+ * not advanced.  The byte at *data is read without a bounds check, so
+ * the caller must ensure that it is readable.
+ */
+njs_inline njs_int_t
+njs_utf8_boundary(njs_unicode_decode_t *ctx, const u_char **data,
+ unsigned *need, u_char lower, u_char upper)
{
- uint32_t u;
+ u_char ch;
- u = (uint32_t) **start;
+ ch = **data;
- if (u < 0x80) {
- (*start)++;
- return u;
+ if (ch < lower || ch > upper) {
+ return NJS_ERROR;
}
- return njs_utf8_decode2(start, end);
+ (*data)++;
+ (*need)--;
+ ctx->codepoint = (ctx->codepoint << 6) | (ch & 0x3F);
+
+ return NJS_OK;
}
-/*
- * njs_utf8_decode2() decodes two and more bytes UTF-8 sequences only
- * and returns a valid character 0x80 - 0x10FFFF, OR 0xFFFFFFFF for
- * invalid or overlong UTF-8 sequence.
- */
+/*
+ * njs_utf8_boundary_set() stores in ctx the range allowed for the byte
+ * that follows the lead byte ch: [lower, 0xBF] if ch is equal to first,
+ * [0x80, upper] if ch is equal to second.  For any other ch the context
+ * is left untouched.
+ */
+njs_inline void
+njs_utf8_boundary_set(njs_unicode_decode_t *ctx, const u_char ch,
+    u_char first, u_char second, u_char lower, u_char upper)
+{
+    if (ch == first) {
+        ctx->lower = lower;
+        ctx->upper = 0xBF;
+
+        return;
+    }
+
+    if (ch == second) {
+        ctx->lower = 0x80;
+        ctx->upper = upper;
+    }
+}
+
+/*
+ * njs_utf8_decode() decodes one UTF-8 sequence starting at *start and
+ * returns its code point.  NJS_UNICODE_ERROR is returned for an invalid
+ * sequence; the offending continuation byte is not consumed.
+ * NJS_UNICODE_CONTINUE is returned when the input ends in the middle of
+ * a sequence: the state is saved in ctx and decoding resumes on the next
+ * call.  Overlong forms, surrogates and values above 0x10FFFF are
+ * rejected by the lead byte checks and by the restricted range of the
+ * byte that follows the lead bytes 0xE0, 0xED, 0xF0 and 0xF4.
+ *
+ * The first byte is read without a bounds check, so the caller must
+ * ensure that *start < end.
+ */
uint32_t
-njs_utf8_decode2(const u_char **start, const u_char *end)
+njs_utf8_decode(njs_unicode_decode_t *ctx, const u_char **start,
+ const u_char *end)
{
u_char c;
- size_t n;
- uint32_t u, overlong;
+ unsigned need;
+ njs_int_t ret;
const u_char *p;
- p = *start;
- u = (uint32_t) *p;
-
- if (u >= 0xE0) {
+ if (ctx->need != 0) {
+ need = ctx->need;
+ ctx->need = 0;
- if (u >= 0xF0) {
-
- if (njs_slow_path(u > 0xF4)) {
- /*
- * The maximum valid Unicode character is 0x10FFFF
- * which is encoded as 0xF4 0x8F 0xBF 0xBF.
- */
- return 0xFFFFFFFF;
+ if (ctx->lower != 0x00) {
+ ret = njs_utf8_boundary(ctx, start, &need, ctx->lower, ctx->upper);
+ if (njs_slow_path(ret != NJS_OK)) {
+ goto failed;
}
- u &= 0x07;
- overlong = 0x00FFFF;
- n = 3;
-
- } else {
- u &= 0x0F;
- overlong = 0x07FF;
- n = 2;
+ ctx->lower = 0x00;
}
- } else if (u >= 0xC2) {
-
- /* 0x80 is encoded as 0xC2 0x80. */
-
- u &= 0x1F;
- overlong = 0x007F;
- n = 1;
-
- } else {
- /* u <= 0xC2 */
- return 0xFFFFFFFF;
+ goto decode;
}
- p++;
-
- if (njs_fast_path(p + n <= end)) {
-
- do {
- c = *p++;
- /*
- * The byte must in the 0x80 - 0xBF range.
- * Values below 0x80 become >= 0x80.
- */
- c = c - 0x80;
-
- if (njs_slow_path(c > 0x3F)) {
- return 0xFFFFFFFF;
- }
+ c = *(*start)++;
- u = (u << 6) | c;
- n--;
+ if (c < 0x80) {
+ return c;
- } while (n != 0);
-
- if (overlong < u && u < 0x110000) {
- *start = p;
- return u;
+ } else if (c <= 0xDF) {
+ if (c < 0xC2) {
+ return NJS_UNICODE_ERROR;
}
- }
-
- return 0xFFFFFFFF;
-}
+ need = 1;
+ ctx->codepoint = c & 0x1F;
-uint32_t
-njs_utf8_safe_decode(const u_char **start, const u_char *end)
-{
- uint32_t u;
+ } else if (c < 0xF0) {
+ need = 2;
+ ctx->codepoint = c & 0x0F;
- u = (uint32_t) **start;
+ if (*start == end) {
+ njs_utf8_boundary_set(ctx, c, 0xE0, 0xED, 0xA0, 0x9F);
+ goto next;
+ }
- if (u < 0x80) {
- (*start)++;
- return u;
- }
+ ret = NJS_OK;
- return njs_utf8_safe_decode2(start, end);
-}
+ if (c == 0xE0) {
+ ret = njs_utf8_boundary(ctx, start, &need, 0xA0, 0xBF);
+ } else if (c == 0xED) {
+ ret = njs_utf8_boundary(ctx, start, &need, 0x80, 0x9F);
+ }
-uint32_t
-njs_utf8_safe_decode2(const u_char **start, const u_char *end)
-{
- u_char c;
- size_t n;
- uint32_t u, overlong;
- const u_char *p;
-
- p = *start;
- u = (uint32_t) *p;
+ if (njs_slow_path(ret != NJS_OK)) {
+ goto failed;
+ }
- if (u >= 0xE0) {
+ } else if (c < 0xF5) {
+ need = 3;
+ ctx->codepoint = c & 0x07;
- if (u >= 0xF0) {
+ if (*start == end) {
+ njs_utf8_boundary_set(ctx, c, 0xF0, 0xF4, 0x90, 0x8F);
+ goto next;
+ }
- if (njs_slow_path(u > 0xF4)) {
- /*
- * The maximum valid Unicode character is 0x10FFFF
- * which is encoded as 0xF4 0x8F 0xBF 0xBF.
- */
- goto fail_one;
- }
+ ret = NJS_OK;
- u &= 0x07;
- overlong = 0x00FFFF;
- n = 3;
+ if (c == 0xF0) {
+ ret = njs_utf8_boundary(ctx, start, &need, 0x90, 0xBF);
- } else {
- u &= 0x0F;
- overlong = 0x07FF;
- n = 2;
+ } else if (c == 0xF4) {
+ ret = njs_utf8_boundary(ctx, start, &need, 0x80, 0x8F);
}
- } else if (u >= 0xC2) {
-
- /* 0x80 is encoded as 0xC2 0x80. */
-
- u &= 0x1F;
- overlong = 0x007F;
- n = 1;
+ if (njs_slow_path(ret != NJS_OK)) {
+ goto failed;
+ }
} else {
- /* u <= 0xC2 */
- goto fail_one;
+ return NJS_UNICODE_ERROR;
}
- p++;
+decode:
+
+ for (p = *start; p < end; p++) {
+ c = *p;
- while (p < end && n != 0) {
- c = *p++;
- /*
- * The byte must in the 0x80 - 0xBF range.
- * Values below 0x80 become >= 0x80.
- */
- c = c - 0x80;
+ if (c < 0x80 || c > 0xBF) {
+ *start = p;
- if (njs_slow_path(c > 0x3F)) {
- *start = --p;
- return NJS_UTF8_REPLACEMENT;
+ goto failed;
}
- u = (u << 6) | c;
- n--;
+ ctx->codepoint = (ctx->codepoint << 6) | (c & 0x3F);
+
+ if (--need == 0) {
+ *start = p + 1;
+
+ return ctx->codepoint;
+ }
}
*start = p;
- if (n == 0 && overlong < u && u < 0x110000) {
- return u;
- }
+next:
- return NJS_UTF8_REPLACEMENT;
+ ctx->need = need;
-fail_one:
+ return NJS_UNICODE_CONTINUE;
- (*start)++;
+failed:
- return NJS_UTF8_REPLACEMENT;
-}
+ ctx->lower = 0x00;
+ ctx->need = 0;
+ return NJS_UNICODE_ERROR;
+}
/*
* njs_utf8_casecmp() tests only up to the minimum of given lengths, but
- * requires lengths of both strings because otherwise njs_utf8_decode2()
+ * requires lengths of both strings because otherwise njs_utf8_decode()
* may fail due to incomplete sequence.
*/
u2 = njs_utf8_lower_case(&start2, end2);
if (njs_slow_path((u1 | u2) == 0xFFFFFFFF)) {
- return NJS_UTF8_SORT_INVALID;
+ return NJS_UNICODE_ERROR;
}
n = u1 - u2;
uint32_t
njs_utf8_lower_case(const u_char **start, const u_char *end)
{
- uint32_t u;
- const uint32_t *block;
+ uint32_t u;
+ const uint32_t *block;
+ njs_unicode_decode_t ctx;
u = (uint32_t) **start;
return njs_unicode_lower_case_block_000[u];
}
- u = njs_utf8_decode2(start, end);
+ njs_utf8_decode_init(&ctx);
+
+ u = njs_utf8_decode(&ctx, start, end);
if (u <= NJS_UNICODE_MAX_LOWER_CASE) {
block = njs_unicode_lower_case_blocks[u / NJS_UNICODE_BLOCK_SIZE];
uint32_t
njs_utf8_upper_case(const u_char **start, const u_char *end)
{
- uint32_t u;
- const uint32_t *block;
+ uint32_t u;
+ const uint32_t *block;
+ njs_unicode_decode_t ctx;
u = (uint32_t) **start;
return njs_unicode_upper_case_block_000[u];
}
- u = njs_utf8_decode2(start, end);
+ njs_utf8_decode_init(&ctx);
+
+ u = njs_utf8_decode(&ctx, start, end);
if (u <= NJS_UNICODE_MAX_UPPER_CASE) {
block = njs_unicode_upper_case_blocks[u / NJS_UNICODE_BLOCK_SIZE];
ssize_t
njs_utf8_length(const u_char *p, size_t len)
{
- ssize_t length;
- const u_char *end;
+ ssize_t length;
+ const u_char *end;
+ njs_unicode_decode_t ctx;
length = 0;
end = p + len;
+ njs_utf8_decode_init(&ctx);
+
while (p < end) {
- if (njs_slow_path(njs_utf8_decode(&p, end) == 0xffffffff)) {
+ if (njs_slow_path(njs_utf8_decode(&ctx, &p, end)
+ > NJS_UNICODE_MAX_CODEPOINT))
+ {
return -1;
}
ssize_t
njs_utf8_safe_length(const u_char *p, size_t len, ssize_t *out_size)
{
- ssize_t size, length;
- uint32_t codepoint;
- const u_char *end;
+ ssize_t size, length;
+ uint32_t codepoint;
+ const u_char *end;
+ njs_unicode_decode_t ctx;
size = 0;
length = 0;
end = p + len;
+ njs_utf8_decode_init(&ctx);
+
while (p < end) {
- codepoint = njs_utf8_safe_decode(&p, end);
+ codepoint = njs_utf8_decode(&ctx, &p, end);
- size += njs_utf8_size(codepoint);
+ if (codepoint <= NJS_UNICODE_MAX_CODEPOINT) {
+ size += njs_utf8_size(codepoint);
+
+ } else {
+ size += njs_utf8_size(NJS_UNICODE_REPLACEMENT);
+ }
length++;
}
njs_bool_t
njs_utf8_is_valid(const u_char *p, size_t len)
{
- const u_char *end;
+ const u_char *end;
+ njs_unicode_decode_t ctx;
end = p + len;
+ njs_utf8_decode_init(&ctx);
+
while (p < end) {
- if (njs_slow_path(njs_utf8_decode(&p, end) == 0xffffffff)) {
+ if (njs_slow_path(njs_utf8_decode(&ctx, &p, end)
+ > NJS_UNICODE_MAX_CODEPOINT))
+ {
return 0;
}
}
#define _NJS_UTF8_H_INCLUDED_
-/*
- * Since the maximum valid Unicode character is 0x0010FFFF, the maximum
- * difference between Unicode characters is lesser 0x0010FFFF and
- * 0x0EEE0EEE can be used as value to indicate UTF-8 encoding error.
- */
-#define NJS_UTF8_SORT_INVALID 0x0EEE0EEE
-
-#define NJS_UTF8_REPLACEMENT 0xFFFD
-
-
NJS_EXPORT u_char *njs_utf8_encode(u_char *p, uint32_t u);
-NJS_EXPORT uint32_t njs_utf8_decode(const u_char **start, const u_char *end);
-NJS_EXPORT uint32_t njs_utf8_decode2(const u_char **start, const u_char *end);
-NJS_EXPORT uint32_t njs_utf8_safe_decode(const u_char **start,
- const u_char *end);
-NJS_EXPORT uint32_t njs_utf8_safe_decode2(const u_char **start,
- const u_char *end);
+NJS_EXPORT uint32_t njs_utf8_decode(njs_unicode_decode_t *ctx,
+ const u_char **data, const u_char *end);
NJS_EXPORT njs_int_t njs_utf8_casecmp(const u_char *start1,
const u_char *start2, size_t len1, size_t len2);
NJS_EXPORT uint32_t njs_utf8_lower_case(const u_char **start,
ssize_t *out_size);
NJS_EXPORT njs_bool_t njs_utf8_is_valid(const u_char *p, size_t len);
-
/*
* njs_utf8_next() and njs_utf8_prev() expect a valid UTF-8 string.
*
}
-#define njs_utf8_size(u) \
- ((u < 0x80) ? 1 : ((u < 0x0800) ? 2 : ((u < 0x10000) ? 3 : 4)))
+/*
+ * njs_utf8_decode_init() resets the decoder context before decoding
+ * a new input.
+ *
+ * Both "need" and "lower" are cleared: njs_utf8_decode() reads ctx->lower
+ * whenever it resumes a split sequence (ctx->need != 0), while ctx->lower
+ * is assigned only for the lead bytes 0xE0, 0xED, 0xF0 and 0xF4.  Without
+ * clearing it here, resuming a sequence that starts with any other lead
+ * byte would read an uninitialized field.
+ */
+njs_inline void
+njs_utf8_decode_init(njs_unicode_decode_t *ctx)
+{
+    ctx->need = 0x00;
+    ctx->lower = 0x00;
+}
+
+
+/*
+ * njs_utf8_size() returns the number of bytes (1 to 4) selected
+ * for the code point cp by the UTF-8 length thresholds 0x80, 0x0800
+ * and 0x10000.
+ */
+njs_inline size_t
+njs_utf8_size(uint32_t cp)
+{
+    if (cp < 0x80) {
+        return 1;
+    }
+
+    if (cp < 0x0800) {
+        return 2;
+    }
+
+    return (cp < 0x10000) ? 3 : 4;
+}
-#define njs_utf8_size_uint16(u) \
- ((u < 0x80) ? 1 : ((u < 0x0800) ? 2 : 3))
+/*
+ * njs_utf8_size_uint16() is the variant of njs_utf8_size() that never
+ * returns more than 3: every cp of 0x0800 or above is counted as 3 bytes.
+ */
+njs_inline size_t
+njs_utf8_size_uint16(uint32_t cp)
+{
+    if (cp < 0x80) {
+        return 1;
+    }
+
+    return (cp < 0x0800) ? 2 : 3;
+}
njs_inline njs_bool_t
{ njs_str("String.fromCharCode(65.14 + 65536)"),
njs_str("A") },
+ { njs_str("String.fromCharCode(0xD83D, 0xDCA9)"),
+ njs_str("💩") },
+
+ { njs_str("String.fromCharCode(0xD83D, 0xDCA9).length"),
+ njs_str("1") },
+
+ { njs_str("String.fromCharCode(0xD83D)"),
+ njs_str("�") },
+
+ { njs_str("String.fromCharCode(0xD83D).length"),
+ njs_str("1") },
+
+ { njs_str("String.fromCharCode(0xD83D) + String.fromCharCode(0xDCA9)"),
+ njs_str("��") },
+
{ njs_str("String.fromCodePoint(65 + 65536)"),
njs_str("𐁁") },
+ { njs_str("String.fromCodePoint(0xD83D, 0xDCA9)"),
+ njs_str("💩") },
+
+ { njs_str("String.fromCodePoint(0xD83D, 0xDCA9).length"),
+ njs_str("1") },
+
+ { njs_str("String.fromCodePoint(0xD83D)"),
+ njs_str("�") },
+
+ { njs_str("String.fromCodePoint(0xD83D).length"),
+ njs_str("1") },
+
{ njs_str("String.fromCharCode(2**53 + 10)"),
njs_str("\n") },
{ njs_str("(function() {"
" var n;"
" for (n = 0; n <= 65536; n++) {"
- " if (String.fromCharCode(n).charCodeAt(0) !== n)"
+
+ /* U+D800..U+DFFF are surrogate code points; they are not valid in UTF-8. */
+
+ " if ((n < 0xD800 || n > 0xDFFF) && String.fromCharCode(n).charCodeAt(0) !== n)"
" return n;"
" }"
" return -1"
{ njs_str("(function() {"
" var n;"
" for (n = 0; n <= 1114111; n++) {"
- " if (String.fromCodePoint(n).codePointAt(0) !== n)"
+ " if ((n < 0xD800 || n > 0xDFFF) && String.fromCodePoint(n).codePointAt(0) !== n)"
" return n;"
" }"
" return -1"
{ njs_str("encodeURI('012абв')"),
njs_str("012%D0%B0%D0%B1%D0%B2")},
- { njs_str("["
- " String.fromCharCode(0xD800),"
- " String.fromCharCode(0xD800) + 'a',"
- " String.fromCharCode(0xDC00),"
- " String.fromCharCode(0xDC00) + 'a',"
- "].every(v=>{try { encodeURI(v)} catch(e) {return e.name == 'URIError'}})"),
- njs_str("true")},
-
{ njs_str("encodeURI(String.fromCharCode(0xD800)+String.fromCharCode(0xDC00))"),
- njs_str("%F0%90%80%80")},
+ njs_str("%EF%BF%BD%EF%BF%BD")},
{ njs_str("encodeURI('~}|{`_^]\\\\[@?>=<;:/.-,+*)(\\\'&%$#\"! ')"),
njs_str("~%7D%7C%7B%60_%5E%5D%5C%5B@?%3E=%3C;:/.-,+*)('&%25$#%22!%20")},
static njs_int_t
utf8_overlong(u_char *overlong, size_t len)
{
- u_char *p, utf8[4];
- size_t size;
- uint32_t u, d;
- njs_uint_t i;
- const u_char *pp;
+ u_char *p, utf8[4];
+ size_t size;
+ uint32_t u, d;
+ njs_uint_t i;
+ const u_char *pp;
+ njs_unicode_decode_t ctx;
+
+ njs_utf8_decode_init(&ctx);
pp = overlong;
- d = njs_utf8_decode(&pp, overlong + len);
+ d = njs_utf8_decode(&ctx, &pp, overlong + len);
len = pp - overlong;
static njs_int_t
utf8_unit_test(njs_uint_t start)
{
- u_char *p, utf8[4];
- size_t len;
- int32_t n;
- uint32_t u, d;
- njs_uint_t i, k, l, m;
- const u_char *pp;
+ u_char *p, utf8[4];
+ size_t len;
+ int32_t n;
+ uint32_t u, d;
+ njs_uint_t i, k, l, m;
+ const u_char *pp;
+ njs_unicode_decode_t ctx;
njs_printf("utf8 test started\n");
/* Test valid UTF-8. */
- for (u = 0; u < 0x110000; u++) {
+ for (u = 0; u <= NJS_UNICODE_MAX_CODEPOINT; u++) {
p = njs_utf8_encode(utf8, u);
pp = utf8;
- d = njs_utf8_decode(&pp, p);
+ njs_utf8_decode_init(&ctx);
+
+ d = njs_utf8_decode(&ctx, &pp, p);
+
+ /* UTF-16 surrogate code points are not allowed in UTF-8. */
+
+ if (u >= 0xD800 && u <= 0xDFFF) {
+ if (d != NJS_UNICODE_ERROR) {
+ njs_printf("njs_utf8_decode(%05uXD) failed for "
+ "surrogate pair: %05uxD\n", u, d);
+
+ return NJS_ERROR;
+ }
+
+ continue;
+ }
if (u != d) {
njs_printf("njs_utf8_decode(%05uXD) failed: %05uxD\n", u, d);
pp = utf8;
- d = njs_utf8_decode(&pp, utf8 + len);
+ njs_utf8_decode_init(&ctx);
+
+ d = njs_utf8_decode(&ctx, &pp, utf8 + len);
- if (d != 0xFFFFFFFF) {
+ if (d <= NJS_UNICODE_MAX_CODEPOINT) {
u = 0;
for (i = 0; i < len; i++) {