fixed regexp case insensitive flag

author: Fabrice Bellard <fabrice@bellard.org> 2024-01-08 18:42:29 +0100
committer: Fabrice Bellard <fabrice@bellard.org> 2024-01-08 18:42:29 +0100
commit: af308614a8b89e80a6e6a3f270292ef724931d1e (patch)
tree: f9f0bf6510bdccc486f1010543386d10abe93874 /libregexp.c
parent: aac24640b13825b550d3e6dbce0a3f9ac8baf577 (diff)
download: quickjs-af308614a8b89e80a6e6a3f270292ef724931d1e.tar.gz
quickjs-af308614a8b89e80a6e6a3f270292ef724931d1e.zip
1 files changed, 1 insertions, 56 deletions
diff --git a/libregexp.c b/libregexp.c
index dab8fa1..eeafd0b 100644
--- a/libregexp.c
+++ b/libregexp.c
@@ -34,9 +34,6 @@
 /*
   TODO:
 
-  - Add full unicode canonicalize rules for character ranges (not
-    really useful but needed for exact "ignorecase" compatibility).
-
   - Add a lock step execution mode (=linear time execution guaranteed)
     when the regular expression is "simple" i.e. no backreference nor
     complicated lookahead. The opcodes are designed for this execution
@@ -120,33 +117,6 @@ static int dbuf_insert(DynBuf *s, int pos, int len)
     return 0;
 }
 
-/* canonicalize with the specific JS regexp rules */
-static uint32_t lre_canonicalize(uint32_t c, BOOL is_utf16)
-{
-    uint32_t res[LRE_CC_RES_LEN_MAX];
-    int len;
-    if (is_utf16) {
-        if (likely(c < 128)) {
-            if (c >= 'A' && c <= 'Z')
-                c = c - 'A' + 'a';
-        } else {
-            lre_case_conv(res, c, 2);
-            c = res[0];
-        }
-    } else {
-        if (likely(c < 128)) {
-            if (c >= 'a' && c <= 'z')
-                c = c - 'a' + 'A';
-        } else {
-            /* legacy regexp: to upper case if single char >= 128 */
-            len = lre_case_conv(res, c, FALSE);
-            if (len == 1 && res[0] >= 128)
-                c = res[0];
-        }
-    }
-    return c;
-}
-
 static const uint16_t char_range_d[] = {
     1,
     0x0030, 0x0039 + 1,
@@ -245,31 +215,6 @@ static int cr_init_char_range(REParseState *s, CharRange *cr, uint32_t c)
     return -1;
 }
 
-static int cr_canonicalize(CharRange *cr)
-{
-    CharRange a;
-    uint32_t pt[2];
-    int i, ret;
-
-    cr_init(&a, cr->mem_opaque, lre_realloc);
-    pt[0] = 'a';
-    pt[1] = 'z' + 1;
-    ret = cr_op(&a, cr->points, cr->len, pt, 2, CR_OP_INTER);
-    if (ret)
-        goto fail;
-    /* convert to upper case */
-    /* XXX: the generic unicode case would be much more complicated
-       and not really useful */
-    for(i = 0; i < a.len; i++) {
-        a.points[i] += 'A' - 'a';
-    }
-    /* Note: for simplicity we keep the lower case ranges */
-    ret = cr_union1(cr, a.points, a.len);
- fail:
-    cr_free(&a);
-    return ret;
-}
-
 #ifdef DUMP_REOP
 static __maybe_unused void lre_dump_bytecode(const uint8_t *buf,
                                                      int buf_len)
@@ -922,7 +867,7 @@ static int re_parse_char_class(REParseState *s, const uint8_t **pp)
         }
     }
     if (s->ignore_case) {
-        if (cr_canonicalize(cr))
+        if (cr_regexp_canonicalize(cr, s->is_utf16))
             goto memory_error;
     }
     if (invert) {
author	Fabrice Bellard <fabrice@bellard.org>	2024-01-08 18:42:29 +0100
committer	Fabrice Bellard <fabrice@bellard.org>	2024-01-08 18:42:29 +0100
commit	af308614a8b89e80a6e6a3f270292ef724931d1e (patch)
tree	f9f0bf6510bdccc486f1010543386d10abe93874 /libregexp.c
parent	aac24640b13825b550d3e6dbce0a3f9ac8baf577 (diff)
download	quickjs-af308614a8b89e80a6e6a3f270292ef724931d1e.tar.gz quickjs-af308614a8b89e80a6e6a3f270292ef724931d1e.zip