git.kaiwu.me - quickjs.git/commitdiff
regexp: ensure that the bytecode size grows linearly with respect to
author Fabrice Bellard <fabrice@bellard.org>
Sat, 29 Nov 2025 11:39:52 +0000 (12:39 +0100)
committer Fabrice Bellard <fabrice@bellard.org>
Sat, 29 Nov 2025 11:39:52 +0000 (12:39 +0100)
the input regexp.

This way, pathological regexps such as
/(:?(:?(:?(:?(:?(:?(:?(:?(:?(:?(:?(:?(:?(?:a|)+|)+|)+|)+|)+|)+|)+|)+|)+|)+|)+|)+|)+|)+/ are no longer an issue. The generated bytecode is also simpler and
faster.

libregexp-opcode.h
libregexp.c

index 9908cf373da0640557382c4d93a2d2847955f526..f0e23454bde87363340ce255cfcd445bd218638d 100644 (file)
@@ -45,6 +45,10 @@ DEF(save_start, 2) /* save start position */
 DEF(save_end, 2) /* save end position, must come after saved_start */
 DEF(save_reset, 3) /* reset save positions */
 DEF(loop, 6) /* decrement the top the stack and goto if != 0 */
+DEF(loop_split_goto_first, 10) /* operands: u8 stack index, u32 limit, u32 jump offset; decrement the counter, jump while counter > limit, else conditional split trying the jump target first */
+DEF(loop_split_next_first, 10) /* same as loop_split_goto_first, but the split tries the next instruction first */
+DEF(loop_check_adv_split_goto_first, 10) /* same as loop_split_goto_first, plus a check-advance test against the saved char position */
+DEF(loop_check_adv_split_next_first, 10) /* same as loop_split_next_first, plus a check-advance test against the saved char position */
 DEF(push_i32, 6) /* push integer on the stack */
 DEF(word_boundary, 1)
 DEF(word_boundary_i, 1)
index 28f407b7208d0fac5aa6c60ca43199b16b93b9bd..d880b1198f401b92ea454b83f6ed4321e5d035dd 100644 (file)
@@ -532,6 +532,19 @@ static __maybe_unused void lre_dump_bytecode(const uint8_t *buf,
             val += (pos + 6);
             printf(" %u, %u", val2, val);
             break;
+        case REOP_loop_split_goto_first:
+        case REOP_loop_split_next_first:
+        case REOP_loop_check_adv_split_goto_first:
+        case REOP_loop_check_adv_split_next_first:
+            {
+                uint32_t limit;
+                val2 = buf[pos + 1];
+                limit = get_u32(buf + pos + 2);
+                val = get_u32(buf + pos + 6);
+                val += (pos + 10);
+                printf(" %u, %u, %u", val2, limit, val);
+            }
+            break;
         case REOP_save_start:
         case REOP_save_end:
         case REOP_back_reference:
@@ -620,6 +633,17 @@ static int re_emit_goto_u8(REParseState *s, int op, uint32_t arg, uint32_t val)
     return pos;
 }
 
+static int re_emit_goto_u8_u32(REParseState *s, int op, uint32_t arg0, uint32_t arg1, uint32_t val) /* append 'op', then arg0, then u32 arg1, then a u32 relative jump to bytecode position 'val' */
+{
+    int pos;
+    dbuf_putc(&s->byte_code, op);
+    dbuf_putc(&s->byte_code, arg0); /* arg0 is written with dbuf_putc like the opcode itself */
+    dbuf_put_u32(&s->byte_code, arg1);
+    pos = s->byte_code.size; /* position of the jump field that is about to be written */
+    dbuf_put_u32(&s->byte_code, val - (pos + 4)); /* displacement is measured from the end of the 4-byte jump field */
+    return pos; /* returns the position of the jump field, not of the opcode */
+}
+
 static void re_emit_op_u8(REParseState *s, int op, uint32_t val)
 {
     dbuf_putc(&s->byte_code, op);
@@ -2183,62 +2207,46 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
                         if (dbuf_insert(&s->byte_code, last_atom_start, 11 + add_zero_advance_check * 2))
                             goto out_of_memory;
                         pos = last_atom_start;
+                        s->byte_code.buf[pos++] = REOP_split_goto_first + greedy;
+                        put_u32(s->byte_code.buf + pos, 6 + add_zero_advance_check * 2 + len + 10);
+                        pos += 4;
+
                         s->byte_code.buf[pos++] = REOP_push_i32;
                         s->byte_code.buf[pos++] = 0;
                         put_u32(s->byte_code.buf + pos, quant_max);
                         pos += 4;
-
-                        s->byte_code.buf[pos++] = REOP_split_goto_first + greedy;
-                        put_u32(s->byte_code.buf + pos, len + 6 + add_zero_advance_check * 2 * 2);
-                        pos += 4;
+                        last_atom_start = pos;
                         if (add_zero_advance_check) {
                             s->byte_code.buf[pos++] = REOP_push_char_pos;
                             s->byte_code.buf[pos++] = 0;
-                            re_emit_op_u8(s, REOP_check_advance, 0);
                         }
-                        re_emit_goto_u8(s, REOP_loop, 0, last_atom_start + 6);
+                        re_emit_goto_u8_u32(s, (add_zero_advance_check ? REOP_loop_check_adv_split_next_first : REOP_loop_split_next_first) - greedy, 0, quant_max, last_atom_start);
                     }
                 } else if (quant_min == 1 && quant_max == INT32_MAX &&
                            !add_zero_advance_check) {
                     re_emit_goto(s, REOP_split_next_first - greedy,
                                  last_atom_start);
                 } else {
-                    if (quant_min == 1) {
-                        /* nothing to add */
-                    } else {
-                        if (dbuf_insert(&s->byte_code, last_atom_start, 6))
-                            goto out_of_memory;
-                        s->byte_code.buf[last_atom_start++] = REOP_push_i32;
-                        s->byte_code.buf[last_atom_start++] = 0;
-                        put_u32(s->byte_code.buf + last_atom_start, quant_min);
-                        last_atom_start += 4;
-                        re_emit_goto_u8(s, REOP_loop, 0, last_atom_start);
+                    if (quant_min == quant_max)
+                        add_zero_advance_check = FALSE;
+                    if (dbuf_insert(&s->byte_code, last_atom_start, 6 + add_zero_advance_check * 2))
+                        goto out_of_memory;
+                    /* Note: we assume the string length is < INT32_MAX */
+                    pos = last_atom_start;
+                    s->byte_code.buf[pos++] = REOP_push_i32;
+                    s->byte_code.buf[pos++] = 0;
+                    put_u32(s->byte_code.buf + pos, quant_max);
+                    pos += 4;
+                    last_atom_start = pos;
+                    if (add_zero_advance_check) {
+                        s->byte_code.buf[pos++] = REOP_push_char_pos;
+                        s->byte_code.buf[pos++] = 0;
                     }
-                    if (quant_max == INT32_MAX) {
-                        pos = s->byte_code.size;
-                        re_emit_op_u32(s, REOP_split_goto_first + greedy,
-                                       len + 5 + add_zero_advance_check * 2 * 2);
-                        if (add_zero_advance_check)
-                            re_emit_op_u8(s, REOP_push_char_pos, 0);
-                        /* copy the atom */
-                        dbuf_put_self(&s->byte_code, last_atom_start, len);
-                        if (add_zero_advance_check)
-                            re_emit_op_u8(s, REOP_check_advance, 0);
-                        re_emit_goto(s, REOP_goto, pos);
-                    } else if (quant_max > quant_min) {
-                        re_emit_op_u8(s, REOP_push_i32, 0);
-                        dbuf_put_u32(&s->byte_code, quant_max - quant_min);
-                        
-                        pos = s->byte_code.size;
-                        re_emit_op_u32(s, REOP_split_goto_first + greedy,
-                                       len + 6 + add_zero_advance_check * 2 * 2);
-                        if (add_zero_advance_check)
-                            re_emit_op_u8(s, REOP_push_char_pos, 0);
-                        /* copy the atom */
-                        dbuf_put_self(&s->byte_code, last_atom_start, len);
-                        if (add_zero_advance_check)
-                            re_emit_op_u8(s, REOP_check_advance, 0);
-                        re_emit_goto_u8(s, REOP_loop, 0, pos);
+                    if (quant_min == quant_max) {
+                        /* a simple loop is enough */
+                        re_emit_goto_u8(s, REOP_loop, 0, last_atom_start);
+                    } else {
+                        re_emit_goto_u8_u32(s, (add_zero_advance_check ? REOP_loop_check_adv_split_next_first : REOP_loop_split_next_first) - greedy, 0, quant_max - quant_min, last_atom_start);
                     }
                 }
                 last_atom_start = -1;
@@ -2352,10 +2360,18 @@ static int compute_stack_size(uint8_t *bc_buf, int bc_buf_len)
             break;
         case REOP_check_advance:
         case REOP_loop:
+        case REOP_loop_split_goto_first:
+        case REOP_loop_split_next_first:
             assert(stack_size > 0);
             stack_size--;
             bc_buf[pos + 1] = stack_size;
             break;
+        case REOP_loop_check_adv_split_goto_first:
+        case REOP_loop_check_adv_split_next_first:
+            assert(stack_size >= 2);
+            stack_size -= 2;
+            bc_buf[pos + 1] = stack_size;
+            break;
         case REOP_range:
         case REOP_range_i:
             val = get_u16(bc_buf + pos + 1);
@@ -2956,6 +2972,56 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
                 }
             }
             break;
+        case REOP_loop_split_goto_first:
+        case REOP_loop_split_next_first:
+        case REOP_loop_check_adv_split_goto_first:
+        case REOP_loop_check_adv_split_next_first:
+            {
+                const uint8_t *pc1;
+                uint32_t val2, limit;
+                idx = pc[0];
+                limit = get_u32(pc + 1);
+                val = get_u32(pc + 5);
+                pc += 9;
+
+                /* decrement the counter */
+                val2 = (uintptr_t)aux_stack[idx] - 1;
+                SAVE_AUX_STACK(idx, (void *)(uintptr_t)val2);
+
+                if (val2 > limit) {
+                    /* normal loop if counter > limit */
+                    pc += (int)val;
+                    if (lre_poll_timeout(s))
+                        return LRE_RET_TIMEOUT;
+                } else {
+                    /* check advance */
+                    if ((opcode == REOP_loop_check_adv_split_goto_first ||
+                         opcode == REOP_loop_check_adv_split_next_first) &&
+                        aux_stack[idx + 1] == cptr &&
+                        val2 != limit) {
+                        goto no_match;
+                    }
+                    
+                    /* otherwise conditional split */
+                    if (val2 != 0) {
+                        if (opcode == REOP_loop_split_next_first ||
+                            opcode == REOP_loop_check_adv_split_next_first) {
+                            pc1 = pc + (int)val;
+                        } else {
+                            pc1 = pc;
+                            pc = pc + (int)val;
+                        }
+                        CHECK_STACK_SPACE(3);
+                        sp[0].ptr = (uint8_t *)pc1;
+                        sp[1].ptr = (uint8_t *)cptr;
+                        sp[2].bp.val = bp - s->stack_buf;
+                        sp[2].bp.type = RE_EXEC_STATE_SPLIT;
+                        sp += 3;
+                        bp = sp;
+                    }
+                }
+            }
+            break;
         case REOP_push_char_pos:
             idx = pc[0];
             pc++;