#define RE_HEADER_FLAGS 0
#define RE_HEADER_CAPTURE_COUNT 1
#define RE_HEADER_STACK_SIZE 2
+#define RE_HEADER_BYTECODE_LEN 3
#define RE_HEADER_LEN 7
assert(buf_len >= RE_HEADER_LEN);
- re_flags = buf[0];
- bc_len = get_u32(buf + 3);
+ re_flags = lre_get_flags(buf);
+ bc_len = get_u32(buf + RE_HEADER_BYTECODE_LEN);
assert(bc_len + RE_HEADER_LEN <= buf_len);
printf("flags: 0x%x capture_count=%d stack_size=%d\n",
- re_flags, buf[1], buf[2]);
+ re_flags, buf[RE_HEADER_CAPTURE_COUNT], buf[RE_HEADER_STACK_SIZE]);
if (re_flags & LRE_FLAG_NAMED_GROUPS) {
const char *p;
p = (char *)buf + RE_HEADER_LEN + bc_len;
printf("named groups: ");
- for(i = 1; i < buf[1]; i++) {
+ for(i = 1; i < buf[RE_HEADER_CAPTURE_COUNT]; i++) {
if (i != 1)
printf(",");
printf("<%s>", p);
}
c = (c << 4) | h;
}
- if (c >= 0xd800 && c < 0xdc00 &&
+ if (is_hi_surrogate(c) &&
allow_utf16 == 2 && p[0] == '\\' && p[1] == 'u') {
/* convert an escaped surrogate pair into a
unicode char */
break;
c1 = (c1 << 4) | h;
}
- if (i == 4 && c1 >= 0xdc00 && c1 < 0xe000) {
+ if (i == 4 && is_lo_surrogate(c1)) {
p += 6;
- c = (((c & 0x3ff) << 10) | (c1 & 0x3ff)) + 0x10000;
+ c = from_surrogate(c, c1);
}
}
}
case REOP_backward_back_reference:
break;
default:
- /* safe behvior: we cannot predict the outcome */
+ /* safe behavior: we cannot predict the outcome */
return TRUE;
}
pos += len;
break;
} else if (c >= 128) {
c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
- if (c >= 0xD800 && c <= 0xDBFF) {
+ if (is_hi_surrogate(c)) {
d = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p1);
- if (d >= 0xDC00 && d <= 0xDFFF) {
- c = 0x10000 + 0x400 * (c - 0xD800) + (d - 0xDC00);
+ if (is_lo_surrogate(d)) {
+ c = from_surrogate(c, d);
p = p1;
}
}
size_t len, name_len;
int capture_index;
- name_len = strlen(name);
p = (char *)s->group_names.buf;
+ if (!p) return -1;
buf_end = (char *)s->group_names.buf + s->group_names.size;
+ name_len = strlen(name);
capture_index = 1;
while (p < buf_end) {
len = strlen(p);
s->byte_code.buf[RE_HEADER_CAPTURE_COUNT] = s->capture_count;
s->byte_code.buf[RE_HEADER_STACK_SIZE] = stack_size;
- put_u32(s->byte_code.buf + 3, s->byte_code.size - RE_HEADER_LEN);
+ put_u32(s->byte_code.buf + RE_HEADER_BYTECODE_LEN,
+ s->byte_code.size - RE_HEADER_LEN);
/* add the named groups if needed */
if (s->group_names.size > (s->capture_count - 1)) {
(c == '_'));
}
-#define GET_CHAR(c, cptr, cbuf_end) \
+#define GET_CHAR(c, cptr, cbuf_end, cbuf_type) \
do { \
if (cbuf_type == 0) { \
c = *cptr++; \
} else { \
- uint32_t __c1; \
- c = *(uint16_t *)cptr; \
- cptr += 2; \
- if (c >= 0xd800 && c < 0xdc00 && \
- cbuf_type == 2 && cptr < cbuf_end) { \
- __c1 = *(uint16_t *)cptr; \
- if (__c1 >= 0xdc00 && __c1 < 0xe000) { \
- c = (((c & 0x3ff) << 10) | (__c1 & 0x3ff)) + 0x10000; \
- cptr += 2; \
+ const uint16_t *_p = (const uint16_t *)cptr; \
+ const uint16_t *_end = (const uint16_t *)cbuf_end; \
+ c = *_p++; \
+ if (is_hi_surrogate(c) && cbuf_type == 2) { \
+ if (_p < _end && is_lo_surrogate(*_p)) { \
+ c = from_surrogate(c, *_p++); \
} \
} \
+ cptr = (const void *)_p; \
} \
} while (0)
-#define PEEK_CHAR(c, cptr, cbuf_end) \
+#define PEEK_CHAR(c, cptr, cbuf_end, cbuf_type) \
do { \
if (cbuf_type == 0) { \
c = cptr[0]; \
} else { \
- uint32_t __c1; \
- c = ((uint16_t *)cptr)[0]; \
- if (c >= 0xd800 && c < 0xdc00 && \
- cbuf_type == 2 && (cptr + 2) < cbuf_end) { \
- __c1 = ((uint16_t *)cptr)[1]; \
- if (__c1 >= 0xdc00 && __c1 < 0xe000) { \
- c = (((c & 0x3ff) << 10) | (__c1 & 0x3ff)) + 0x10000; \
+ const uint16_t *_p = (const uint16_t *)cptr; \
+ const uint16_t *_end = (const uint16_t *)cbuf_end; \
+ c = *_p++; \
+ if (is_hi_surrogate(c) && cbuf_type == 2) { \
+ if (_p < _end && is_lo_surrogate(*_p)) { \
+ c = from_surrogate(c, *_p); \
} \
} \
} \
} while (0)
-#define PEEK_PREV_CHAR(c, cptr, cbuf_start) \
+#define PEEK_PREV_CHAR(c, cptr, cbuf_start, cbuf_type) \
do { \
if (cbuf_type == 0) { \
c = cptr[-1]; \
} else { \
- uint32_t __c1; \
- c = ((uint16_t *)cptr)[-1]; \
- if (c >= 0xdc00 && c < 0xe000 && \
- cbuf_type == 2 && (cptr - 4) >= cbuf_start) { \
- __c1 = ((uint16_t *)cptr)[-2]; \
- if (__c1 >= 0xd800 && __c1 < 0xdc00 ) { \
- c = (((__c1 & 0x3ff) << 10) | (c & 0x3ff)) + 0x10000; \
+ const uint16_t *_p = (const uint16_t *)cptr - 1; \
+ const uint16_t *_start = (const uint16_t *)cbuf_start; \
+ c = *_p; \
+ if (is_lo_surrogate(c) && cbuf_type == 2) { \
+ if (_p > _start && is_hi_surrogate(_p[-1])) { \
+ c = from_surrogate(*--_p, c); \
} \
} \
} \
} while (0)
-#define GET_PREV_CHAR(c, cptr, cbuf_start) \
+#define GET_PREV_CHAR(c, cptr, cbuf_start, cbuf_type) \
do { \
if (cbuf_type == 0) { \
cptr--; \
c = cptr[0]; \
} else { \
- uint32_t __c1; \
- cptr -= 2; \
- c = ((uint16_t *)cptr)[0]; \
- if (c >= 0xdc00 && c < 0xe000 && \
- cbuf_type == 2 && cptr > cbuf_start) { \
- __c1 = ((uint16_t *)cptr)[-1]; \
- if (__c1 >= 0xd800 && __c1 < 0xdc00 ) { \
- cptr -= 2; \
- c = (((__c1 & 0x3ff) << 10) | (c & 0x3ff)) + 0x10000; \
+ const uint16_t *_p = (const uint16_t *)cptr - 1; \
+ const uint16_t *_start = (const uint16_t *)cbuf_start; \
+ c = *_p; \
+ if (is_lo_surrogate(c) && cbuf_type == 2) { \
+ if (_p > _start && is_hi_surrogate(_p[-1])) { \
+ c = from_surrogate(*--_p, c); \
} \
} \
+ cptr = (const void *)_p; \
} \
} while (0)
-#define PREV_CHAR(cptr, cbuf_start) \
+#define PREV_CHAR(cptr, cbuf_start, cbuf_type) \
do { \
if (cbuf_type == 0) { \
cptr--; \
} else { \
- cptr -= 2; \
- if (cbuf_type == 2) { \
- c = ((uint16_t *)cptr)[0]; \
- if (c >= 0xdc00 && c < 0xe000 && cptr > cbuf_start) { \
- c = ((uint16_t *)cptr)[-1]; \
- if (c >= 0xd800 && c < 0xdc00) \
- cptr -= 2; \
+ const uint16_t *_p = (const uint16_t *)cptr - 1; \
+ const uint16_t *_start = (const uint16_t *)cbuf_start; \
+ if (is_lo_surrogate(*_p) && cbuf_type == 2) { \
+ if (_p > _start && is_hi_surrogate(_p[-1])) { \
+ --_p; \
} \
} \
+ cptr = (const void *)_p; \
} \
} while (0)
/* go backward */
char_count = get_u32(pc + 12);
for(i = 0; i < char_count; i++) {
- PREV_CHAR(cptr, s->cbuf);
+ PREV_CHAR(cptr, s->cbuf, cbuf_type);
}
pc = (pc + 16) + (int)get_u32(pc);
rs->cptr = cptr;
test_char:
if (cptr >= cbuf_end)
goto no_match;
- GET_CHAR(c, cptr, cbuf_end);
+ GET_CHAR(c, cptr, cbuf_end, cbuf_type);
if (s->ignore_case) {
c = lre_canonicalize(c, s->is_unicode);
}
break;
if (!s->multi_line)
goto no_match;
- PEEK_PREV_CHAR(c, cptr, s->cbuf);
+ PEEK_PREV_CHAR(c, cptr, s->cbuf, cbuf_type);
if (!is_line_terminator(c))
goto no_match;
break;
break;
if (!s->multi_line)
goto no_match;
- PEEK_CHAR(c, cptr, cbuf_end);
+ PEEK_CHAR(c, cptr, cbuf_end, cbuf_type);
if (!is_line_terminator(c))
goto no_match;
break;
case REOP_dot:
if (cptr == cbuf_end)
goto no_match;
- GET_CHAR(c, cptr, cbuf_end);
+ GET_CHAR(c, cptr, cbuf_end, cbuf_type);
if (is_line_terminator(c))
goto no_match;
break;
case REOP_any:
if (cptr == cbuf_end)
goto no_match;
- GET_CHAR(c, cptr, cbuf_end);
+ GET_CHAR(c, cptr, cbuf_end, cbuf_type);
break;
case REOP_save_start:
case REOP_save_end:
if (cptr == s->cbuf) {
v1 = FALSE;
} else {
- PEEK_PREV_CHAR(c, cptr, s->cbuf);
+ PEEK_PREV_CHAR(c, cptr, s->cbuf, cbuf_type);
v1 = is_word_char(c);
}
/* current char */
if (cptr >= cbuf_end) {
v2 = FALSE;
} else {
- PEEK_CHAR(c, cptr, cbuf_end);
+ PEEK_CHAR(c, cptr, cbuf_end, cbuf_type);
v2 = is_word_char(c);
}
if (v1 ^ v2 ^ (REOP_not_word_boundary - opcode))
while (cptr1 < cptr1_end) {
if (cptr >= cbuf_end)
goto no_match;
- GET_CHAR(c1, cptr1, cptr1_end);
- GET_CHAR(c2, cptr, cbuf_end);
+ GET_CHAR(c1, cptr1, cptr1_end, cbuf_type);
+ GET_CHAR(c2, cptr, cbuf_end, cbuf_type);
if (s->ignore_case) {
c1 = lre_canonicalize(c1, s->is_unicode);
c2 = lre_canonicalize(c2, s->is_unicode);
while (cptr1 > cptr1_start) {
if (cptr == s->cbuf)
goto no_match;
- GET_PREV_CHAR(c1, cptr1, cptr1_start);
- GET_PREV_CHAR(c2, cptr, s->cbuf);
+ GET_PREV_CHAR(c1, cptr1, cptr1_start, cbuf_type);
+ GET_PREV_CHAR(c2, cptr, s->cbuf, cbuf_type);
if (s->ignore_case) {
c1 = lre_canonicalize(c1, s->is_unicode);
c2 = lre_canonicalize(c2, s->is_unicode);
pc += 2;
if (cptr >= cbuf_end)
goto no_match;
- GET_CHAR(c, cptr, cbuf_end);
+ GET_CHAR(c, cptr, cbuf_end, cbuf_type);
if (s->ignore_case) {
c = lre_canonicalize(c, s->is_unicode);
}
pc += 2;
if (cptr >= cbuf_end)
goto no_match;
- GET_CHAR(c, cptr, cbuf_end);
+ GET_CHAR(c, cptr, cbuf_end, cbuf_type);
if (s->ignore_case) {
c = lre_canonicalize(c, s->is_unicode);
}
/* go to the previous char */
if (cptr == s->cbuf)
goto no_match;
- PREV_CHAR(cptr, s->cbuf);
+ PREV_CHAR(cptr, s->cbuf, cbuf_type);
break;
case REOP_simple_greedy_quant:
{
int re_flags, i, alloca_size, ret;
StackInt *stack_buf;
- re_flags = bc_buf[RE_HEADER_FLAGS];
+ re_flags = lre_get_flags(bc_buf);
s->multi_line = (re_flags & LRE_FLAG_MULTILINE) != 0;
s->ignore_case = (re_flags & LRE_FLAG_IGNORECASE) != 0;
s->is_unicode = (re_flags & LRE_FLAG_UNICODE) != 0;
uint32_t re_bytecode_len;
if ((lre_get_flags(bc_buf) & LRE_FLAG_NAMED_GROUPS) == 0)
return NULL;
- re_bytecode_len = get_u32(bc_buf + 3);
- return (const char *)(bc_buf + 7 + re_bytecode_len);
+ re_bytecode_len = get_u32(bc_buf + RE_HEADER_BYTECODE_LEN);
+ return (const char *)(bc_buf + RE_HEADER_LEN + re_bytecode_len);
}
#ifdef TEST
int main(int argc, char **argv)
{
- int len, ret, i;
+ int len, flags, ret, i;
uint8_t *bc;
char error_msg[64];
uint8_t *capture[CAPTURE_COUNT_MAX * 2];
const char *input;
int input_len, capture_count;
- if (argc < 3) {
- printf("usage: %s regexp input\n", argv[0]);
- exit(1);
+ if (argc < 4) {
+ printf("usage: %s regexp flags input\n", argv[0]);
+ return 1;
}
+ flags = atoi(argv[2]);
bc = lre_compile(&len, error_msg, sizeof(error_msg), argv[1],
- strlen(argv[1]), 0, NULL);
+ strlen(argv[1]), flags, NULL);
if (!bc) {
fprintf(stderr, "error: %s\n", error_msg);
exit(1);
}
- input = argv[2];
+ input = argv[3];
input_len = strlen(input);
ret = lre_exec(capture, bc, (uint8_t *)input, 0, input_len, 0, NULL);
{
if (unlikely(c >= 0x10000)) {
/* surrogate pair */
- c -= 0x10000;
- if (string_buffer_putc16(s, (c >> 10) + 0xd800))
+ if (string_buffer_putc16(s, get_hi_surrogate(c)))
return -1;
- c = (c & 0x3ff) + 0xdc00;
+ c = get_lo_surrogate(c);
}
return string_buffer_putc16(s, c);
}
idx = *pidx;
if (p->is_wide_char) {
c = p->u.str16[idx++];
- if (c >= 0xd800 && c < 0xdc00 && idx < p->len) {
+ if (is_hi_surrogate(c) && idx < p->len) {
c1 = p->u.str16[idx];
- if (c1 >= 0xdc00 && c1 < 0xe000) {
- c = (((c & 0x3ff) << 10) | (c1 & 0x3ff)) + 0x10000;
+ if (is_lo_surrogate(c1)) {
+ c = from_surrogate(c, c1);
idx++;
}
}
} else if (c <= 0x10FFFF) {
p = p_next;
/* surrogate pair */
- c -= 0x10000;
- string_buffer_putc16(b, (c >> 10) + 0xd800);
- c = (c & 0x3ff) + 0xdc00;
+ string_buffer_putc16(b, get_hi_surrogate(c));
+ c = get_lo_surrogate(c);
} else {
/* invalid char */
c = 0xfffd;
if (c < 0x80) {
*q++ = c;
} else {
- if (c >= 0xd800 && c < 0xdc00) {
+ if (is_hi_surrogate(c)) {
if (pos < len && !cesu8) {
c1 = src[pos];
- if (c1 >= 0xdc00 && c1 < 0xe000) {
+ if (is_lo_surrogate(c1)) {
pos++;
- /* surrogate pair */
- c = (((c & 0x3ff) << 10) | (c1 & 0x3ff)) + 0x10000;
+ c = from_surrogate(c, c1);
} else {
/* Keep unmatched surrogate code points */
/* c = 0xfffd; */ /* error */
goto fail;
break;
default:
- if (c < 32 || (c >= 0xd800 && c < 0xe000)) {
+ if (c < 32 || is_surrogate(c)) {
snprintf(buf, sizeof(buf), "\\u%04x", c);
if (string_buffer_puts8(b, buf))
goto fail;
-1 if none */
static int js_string_find_invalid_codepoint(JSString *p)
{
- int i, c;
+ int i;
if (!p->is_wide_char)
return -1;
for(i = 0; i < p->len; i++) {
- c = p->u.str16[i];
- if (c >= 0xD800 && c <= 0xDFFF) {
- if (c >= 0xDC00 || (i + 1) >= p->len)
- return i;
- c = p->u.str16[i + 1];
- if (c < 0xDC00 || c > 0xDFFF)
+ uint32_t c = p->u.str16[i];
+ if (is_surrogate(c)) {
+ if (is_hi_surrogate(c) && (i + 1) < p->len
+ && is_lo_surrogate(p->u.str16[i + 1])) {
+ i++;
+ } else {
return i;
- i++;
+ }
}
}
return -1;
{
JSValue str, ret;
JSString *p;
- int c, i;
+ int i;
str = JS_ToStringCheckObject(ctx, this_val);
if (JS_IsException(str))
p = JS_VALUE_GET_STRING(ret);
for (; i < p->len; i++) {
- c = p->u.str16[i];
- if (c >= 0xD800 && c <= 0xDFFF) {
- if (c >= 0xDC00 || (i + 1) >= p->len) {
- p->u.str16[i] = 0xFFFD;
+ uint32_t c = p->u.str16[i];
+ if (is_surrogate(c)) {
+ if (is_hi_surrogate(c) && (i + 1) < p->len
+ && is_lo_surrogate(p->u.str16[i + 1])) {
+ i++;
} else {
- c = p->u.str16[i + 1];
- if (c < 0xDC00 || c > 0xDFFF) {
- p->u.str16[i] = 0xFFFD;
- } else {
- i++;
- }
+ p->u.str16[i] = 0xFFFD;
}
}
}
idx--;
if (p->is_wide_char) {
c = p->u.str16[idx];
- if (c >= 0xdc00 && c < 0xe000 && idx > 0) {
+ if (is_lo_surrogate(c) && idx > 0) {
c1 = p->u.str16[idx - 1];
- if (c1 >= 0xd800 && c1 <= 0xdc00) {
- c = (((c1 & 0x3ff) << 10) | (c & 0x3ff)) + 0x10000;
+ if (is_hi_surrogate(c1)) {
+ c = from_surrogate(c1, c);
idx--;
}
}
}
c = (c << 6) | (c1 & 0x3f);
}
- if (c < c_min || c > 0x10FFFF ||
- (c >= 0xd800 && c < 0xe000)) {
+ if (c < c_min || c > 0x10FFFF || is_surrogate(c)) {
js_throw_URIError(ctx, "malformed UTF-8");
goto fail;
}
if (isURIUnescaped(c, isComponent)) {
string_buffer_putc16(b, c);
} else {
- if (c >= 0xdc00 && c <= 0xdfff) {
+ if (is_lo_surrogate(c)) {
js_throw_URIError(ctx, "invalid character");
goto fail;
- } else if (c >= 0xd800 && c <= 0xdbff) {
+ } else if (is_hi_surrogate(c)) {
if (k >= p->len) {
js_throw_URIError(ctx, "expecting surrogate pair");
goto fail;
}
c1 = string_get(p, k);
k++;
- if (c1 < 0xdc00 || c1 > 0xdfff) {
+ if (!is_lo_surrogate(c1)) {
js_throw_URIError(ctx, "expecting surrogate pair");
goto fail;
}
- c = (((c & 0x3ff) << 10) | (c1 & 0x3ff)) + 0x10000;
+ c = from_surrogate(c, c1);
}
if (c < 0x80) {
encodeURI_hex(b, c);