summary | refs | log | tree | commit | diff
path: root/libregexp.c
diff options
context:
space:
mode:
author Fabrice Bellard <fabrice@bellard.org> 2024-01-08 18:42:29 +0100
committer Fabrice Bellard <fabrice@bellard.org> 2024-01-08 18:42:29 +0100
commit af308614a8b89e80a6e6a3f270292ef724931d1e (patch)
tree f9f0bf6510bdccc486f1010543386d10abe93874 /libregexp.c
parent aac24640b13825b550d3e6dbce0a3f9ac8baf577 (diff)
download quickjs-af308614a8b89e80a6e6a3f270292ef724931d1e.tar.gz
quickjs-af308614a8b89e80a6e6a3f270292ef724931d1e.zip
fixed regexp case insensitive flag
Diffstat (limited to 'libregexp.c')
-rw-r--r-- libregexp.c 57
1 files changed, 1 insertions, 56 deletions
diff --git a/libregexp.c b/libregexp.c
index dab8fa1..eeafd0b 100644
--- a/libregexp.c
+++ b/libregexp.c
@@ -34,9 +34,6 @@
/*
TODO:
- - Add full unicode canonicalize rules for character ranges (not
- really useful but needed for exact "ignorecase" compatibility).
-
- Add a lock step execution mode (=linear time execution guaranteed)
when the regular expression is "simple" i.e. no backreference nor
complicated lookahead. The opcodes are designed for this execution
@@ -120,33 +117,6 @@ static int dbuf_insert(DynBuf *s, int pos, int len)
return 0;
}
-/* canonicalize with the specific JS regexp rules */
-static uint32_t lre_canonicalize(uint32_t c, BOOL is_utf16)
-{
- uint32_t res[LRE_CC_RES_LEN_MAX];
- int len;
- if (is_utf16) {
- if (likely(c < 128)) {
- if (c >= 'A' && c <= 'Z')
- c = c - 'A' + 'a';
- } else {
- lre_case_conv(res, c, 2);
- c = res[0];
- }
- } else {
- if (likely(c < 128)) {
- if (c >= 'a' && c <= 'z')
- c = c - 'a' + 'A';
- } else {
- /* legacy regexp: to upper case if single char >= 128 */
- len = lre_case_conv(res, c, FALSE);
- if (len == 1 && res[0] >= 128)
- c = res[0];
- }
- }
- return c;
-}
-
static const uint16_t char_range_d[] = {
1,
0x0030, 0x0039 + 1,
@@ -245,31 +215,6 @@ static int cr_init_char_range(REParseState *s, CharRange *cr, uint32_t c)
return -1;
}
-static int cr_canonicalize(CharRange *cr)
-{
- CharRange a;
- uint32_t pt[2];
- int i, ret;
-
- cr_init(&a, cr->mem_opaque, lre_realloc);
- pt[0] = 'a';
- pt[1] = 'z' + 1;
- ret = cr_op(&a, cr->points, cr->len, pt, 2, CR_OP_INTER);
- if (ret)
- goto fail;
- /* convert to upper case */
- /* XXX: the generic unicode case would be much more complicated
- and not really useful */
- for(i = 0; i < a.len; i++) {
- a.points[i] += 'A' - 'a';
- }
- /* Note: for simplicity we keep the lower case ranges */
- ret = cr_union1(cr, a.points, a.len);
- fail:
- cr_free(&a);
- return ret;
-}
-
#ifdef DUMP_REOP
static __maybe_unused void lre_dump_bytecode(const uint8_t *buf,
int buf_len)
@@ -922,7 +867,7 @@ static int re_parse_char_class(REParseState *s, const uint8_t **pp)
}
}
if (s->ignore_case) {
- if (cr_canonicalize(cr))
+ if (cr_regexp_canonicalize(cr, s->is_utf16))
goto memory_error;
}
if (invert) {