/*
TODO:
-
+ - remove REOP_char_i and REOP_range_i by precomputing the case folding.
+ - add specific opcodes for simple unicode property tests so that the
+ generated bytecode is smaller.
- Add a lock step execution mode (=linear time execution guaranteed)
when the regular expression is "simple" i.e. no backreference nor
complicated lookahead. The opcodes are designed for this execution
goto default_escape;
if (cr_init_char_range(s, cr, c))
return -1;
- c = CLASS_RANGE_BASE;
+ c += CLASS_RANGE_BASE;
break;
case 'c':
c = *p;
case REOP_char32_i:
case REOP_dot:
case REOP_any:
+ case REOP_space:
+ case REOP_not_space:
need_check_adv = FALSE;
break;
case REOP_line_start:
case 'b':
case 'B':
if (p[1] != 'b') {
- re_emit_op(s, s->ignore_case ? REOP_not_word_boundary_i : REOP_not_word_boundary);
+ re_emit_op(s, s->ignore_case && s->is_unicode ? REOP_not_word_boundary_i : REOP_not_word_boundary);
} else {
- re_emit_op(s, s->ignore_case ? REOP_word_boundary_i : REOP_word_boundary);
+ re_emit_op(s, s->ignore_case && s->is_unicode ? REOP_word_boundary_i : REOP_word_boundary);
}
p += 2;
break;
if (is_backward_dir)
re_emit_op(s, REOP_prev);
if (c >= CLASS_RANGE_BASE) {
- int ret;
- ret = re_emit_string_list(s, cr);
+ int ret = 0;
+ /* optimize the common 'space' tests */
+ if (c == (CLASS_RANGE_BASE + CHAR_RANGE_s)) {
+ re_emit_op(s, REOP_space);
+ } else if (c == (CLASS_RANGE_BASE + CHAR_RANGE_S)) {
+ re_emit_op(s, REOP_not_space);
+ } else {
+ ret = re_emit_string_list(s, cr);
+ }
re_string_list_free(cr);
if (ret)
return -1;
return (c == '\n' || c == '\r' || c == CP_LS || c == CP_PS);
}
-static BOOL is_word_char(uint32_t c)
-{
- return ((c >= '0' && c <= '9') ||
- (c >= 'a' && c <= 'z') ||
- (c >= 'A' && c <= 'Z') ||
- (c == '_'));
-}
-
#define GET_CHAR(c, cptr, cbuf_end, cbuf_type) \
do { \
if (cbuf_type == 0) { \
/* return 1 if match, 0 if not match or < 0 if error. */
static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
- uint8_t **regs, const uint8_t *pc, const uint8_t *cptr)
+ const uint8_t *pc, const uint8_t *cptr)
{
int opcode;
int cbuf_type;
}
/* avoid saving the previous value if already saved */
+/* SAVE_CAPTURE_CHECK(idx, value): set capture[idx] to 'value'. Before the
+   store, the (idx, previous capture[idx]) pair is pushed on the stack
+   unless a pair with the same idx is already present between 'bp' and
+   'sp'; these pairs are what the backtracking code pops to restore
+   capture[]. */
-#define SAVE_REG(idx, value) \
+#define SAVE_CAPTURE_CHECK(idx, value) \
{ \
StackElem *sp1; \
sp1 = sp; \
for(;;) { \
if (sp1 > bp) { \
- if (sp1[-2].val == -(int)(idx + 1)) \
+ if (sp1[-2].val == idx) \
break; \
sp1 -= 2; \
} else { \
CHECK_STACK_SPACE(2); \
- sp[0].val = -(int)(idx + 1); \
- sp[1].ptr = regs[idx]; \
+ sp[0].val = idx; \
+ sp[1].ptr = capture[idx]; \
sp += 2; \
break; \
} \
} \
- regs[idx] = (value); \
+ capture[idx] = (value); \
}
REExecStateEnum type;
if (bp == s->stack_buf)
return 0;
- /* undo the modifications to capture[] and regs[] */
+ /* undo the modifications to capture[] */
while (sp > bp) {
- intptr_t idx2 = sp[-2].val;
- if (idx2 >= 0)
- capture[idx2] = sp[-1].ptr;
- else
- regs[-idx2 - 1] = sp[-1].ptr;
+ capture[sp[-2].val] = sp[-1].ptr;
sp -= 2;
}
for(;;) {
REExecStateEnum type;
type = bp[-1].bp.type;
- /* undo the modifications to capture[] and regs[] */
+ /* undo the modifications to capture[] */
while (sp > bp) {
- intptr_t idx2 = sp[-2].val;
- if (idx2 >= 0)
- capture[idx2] = sp[-1].ptr;
- else
- regs[-idx2 - 1] = sp[-1].ptr;
+ capture[sp[-2].val] = sp[-1].ptr;
sp -= 2;
}
pc = sp[-3].ptr;
goto no_match;
GET_CHAR(c, cptr, cbuf_end, cbuf_type);
break;
+ case REOP_space:
+ if (cptr == cbuf_end)
+ goto no_match;
+ GET_CHAR(c, cptr, cbuf_end, cbuf_type);
+ if (!lre_is_space(c))
+ goto no_match;
+ break;
+ case REOP_not_space:
+ if (cptr == cbuf_end)
+ goto no_match;
+ GET_CHAR(c, cptr, cbuf_end, cbuf_type);
+ if (lre_is_space(c))
+ goto no_match;
+ break;
case REOP_save_start:
case REOP_save_end:
val = *pc++;
}
break;
case REOP_set_i32:
- idx = pc[0];
+ idx = 2 * s->capture_count + pc[0];
val = get_u32(pc + 1);
pc += 5;
- SAVE_REG(idx, (void *)(uintptr_t)val);
+ SAVE_CAPTURE_CHECK(idx, (void *)(uintptr_t)val);
break;
case REOP_loop:
{
uint32_t val2;
- idx = pc[0];
+ idx = 2 * s->capture_count + pc[0];
val = get_u32(pc + 1);
pc += 5;
- val2 = (uintptr_t)regs[idx] - 1;
- SAVE_REG(idx, (void *)(uintptr_t)val2);
+ val2 = (uintptr_t)capture[idx] - 1;
+ SAVE_CAPTURE_CHECK(idx, (void *)(uintptr_t)val2);
if (val2 != 0) {
pc += (int)val;
if (lre_poll_timeout(s))
{
const uint8_t *pc1;
uint32_t val2, limit;
- idx = pc[0];
+ idx = 2 * s->capture_count + pc[0];
limit = get_u32(pc + 1);
val = get_u32(pc + 5);
pc += 9;
/* decrement the counter */
- val2 = (uintptr_t)regs[idx] - 1;
- SAVE_REG(idx, (void *)(uintptr_t)val2);
+ val2 = (uintptr_t)capture[idx] - 1;
+ SAVE_CAPTURE_CHECK(idx, (void *)(uintptr_t)val2);
if (val2 > limit) {
/* normal loop if counter > limit */
/* check advance */
if ((opcode == REOP_loop_check_adv_split_goto_first ||
opcode == REOP_loop_check_adv_split_next_first) &&
- regs[idx + 1] == cptr &&
+ capture[idx + 1] == cptr &&
val2 != limit) {
goto no_match;
}
}
break;
case REOP_set_char_pos:
- idx = pc[0];
+ idx = 2 * s->capture_count + pc[0];
pc++;
- SAVE_REG(idx, (uint8_t *)cptr);
+ SAVE_CAPTURE_CHECK(idx, (uint8_t *)cptr);
break;
case REOP_check_advance:
- idx = pc[0];
+ idx = 2 * s->capture_count + pc[0];
pc++;
- if (regs[idx] == cptr)
+ if (capture[idx] == cptr)
goto no_match;
break;
case REOP_word_boundary:
v1 = FALSE;
} else {
PEEK_PREV_CHAR(c, cptr, s->cbuf, cbuf_type);
- if (ignore_case)
- c = lre_canonicalize(c, s->is_unicode);
- v1 = is_word_char(c);
+ if (c < 256) {
+ v1 = (lre_is_word_byte(c) != 0);
+ } else {
+ v1 = ignore_case && (c == 0x017f || c == 0x212a);
+ }
}
/* current char */
if (cptr >= cbuf_end) {
v2 = FALSE;
} else {
PEEK_CHAR(c, cptr, cbuf_end, cbuf_type);
- if (ignore_case)
- c = lre_canonicalize(c, s->is_unicode);
- v2 = is_word_char(c);
+ if (c < 256) {
+ v2 = (lre_is_word_byte(c) != 0);
+ } else {
+ v2 = ignore_case && (c == 0x017f || c == 0x212a);
+ }
}
if (v1 ^ v2 ^ is_boundary)
goto no_match;
int cbuf_type, void *opaque)
{
REExecContext s_s, *s = &s_s;
- int re_flags, i, ret, register_count;
- uint8_t **regs;
+ int re_flags, i, ret;
const uint8_t *cptr;
re_flags = lre_get_flags(bc_buf);
for(i = 0; i < s->capture_count * 2; i++)
capture[i] = NULL;
- /* XXX: modify the API so that the registers are allocated after
- the captures to suppress some tests */
- register_count = bc_buf[RE_HEADER_REGISTER_COUNT];
- regs = alloca(register_count * sizeof(regs[0]));
cptr = cbuf + (cindex << cbuf_type);
if (0 < cindex && cindex < clen && s->cbuf_type == 2) {
}
}
- ret = lre_exec_backtrack(s, capture, regs, bc_buf + RE_HEADER_LEN,
- cptr);
+ ret = lre_exec_backtrack(s, capture, bc_buf + RE_HEADER_LEN, cptr);
+
if (s->stack_buf != s->static_stack_buf)
lre_realloc(s->opaque, s->stack_buf, 0);
return ret;
}
+/* Return the number of pointer slots the caller must provide in the
+   'capture' array passed to lre_exec(): two per capture group plus one
+   per register, both counts being read from the bytecode header. */
+int lre_get_alloc_count(const uint8_t *bc_buf)
+{
+ return bc_buf[RE_HEADER_CAPTURE_COUNT] * 2 +
+ bc_buf[RE_HEADER_REGISTER_COUNT];
+}
+
int lre_get_capture_count(const uint8_t *bc_buf)
{
return bc_buf[RE_HEADER_CAPTURE_COUNT];
int len, flags, ret, i;
uint8_t *bc;
char error_msg[64];
- uint8_t *capture[CAPTURE_COUNT_MAX * 2];
+ /* heap-allocated array of capture/register pointers handed to lre_exec();
+    it must be uint8_t ** so that sizeof(capture[0]) is the size of a
+    pointer when the array is allocated */
+ uint8_t **capture;
const char *input;
int input_len, capture_count;
input = argv[3];
input_len = strlen(input);
+ capture = malloc(sizeof(capture[0]) * lre_get_alloc_count(bc));
ret = lre_exec(capture, bc, (uint8_t *)input, 0, input_len, 0, NULL);
printf("ret=%d\n", ret);
if (ret == 1) {
printf("\n");
}
}
+ free(capture);
return 0;
}
#endif
goto add_tail;
goto done;
}
- q = p;
for (q = p; (q += !r) <= s - r - !r; q = p = e + r) {
e = string_indexof(sp, rp, q);
if (e < 0)
JSValue indices, indices_groups;
uint8_t *re_bytecode;
uint8_t **capture, *str_buf;
- int rc, capture_count, shift, i, re_flags;
+ int rc, capture_count, shift, i, re_flags, alloc_count;
int64_t last_index;
const char *group_name_ptr;
JSObject *p_obj;
last_index = 0;
}
str = JS_VALUE_GET_STRING(str_val);
- capture_count = lre_get_capture_count(re_bytecode);
- if (capture_count > 0) {
- capture = js_malloc(ctx, sizeof(capture[0]) * capture_count * 2);
+ alloc_count = lre_get_alloc_count(re_bytecode);
+ if (alloc_count > 0) {
+ capture = js_malloc(ctx, sizeof(capture[0]) * alloc_count);
if (!capture)
goto fail;
}
+ capture_count = lre_get_capture_count(re_bytecode);
shift = str->is_wide_char;
str_buf = str->u.str8;
if (last_index > str->len) {
uint8_t *re_bytecode;
int ret;
uint8_t **capture, *str_buf;
- int capture_count, shift, re_flags;
+ int capture_count, alloc_count, shift, re_flags;
int next_src_pos, start, end;
int64_t last_index;
StringBuffer b_s, *b = &b_s;
if (js_regexp_get_lastIndex(ctx, &last_index, this_val))
goto fail;
}
- capture_count = lre_get_capture_count(re_bytecode);
- if (capture_count > 0) {
- capture = js_malloc(ctx, sizeof(capture[0]) * capture_count * 2);
+ alloc_count = lre_get_alloc_count(re_bytecode);
+ if (alloc_count > 0) {
+ capture = js_malloc(ctx, sizeof(capture[0]) * alloc_count);
if (!capture)
goto fail;
}
+ capture_count = lre_get_capture_count(re_bytecode);
fullUnicode = ((re_flags & (LRE_FLAG_UNICODE | LRE_FLAG_UNICODE_SETS)) != 0);
shift = str->is_wide_char;
str_buf = str->u.str8;