aboutsummaryrefslogtreecommitdiff
path: root/src/backend/utils/adt
diff options
context:
space:
mode:
author Tom Lane <tgl@sss.pgh.pa.us> 2021-08-09 11:26:34 -0400
committer Tom Lane <tgl@sss.pgh.pa.us> 2021-08-09 11:26:34 -0400
commit 0e6aa8747d439bb7f08f95e358f0509c50396785 (patch)
tree 64977d63a54b4f5c266692966282c6d7aab3aba7 /src/backend/utils/adt
parent 76ad24400d73fa10d527844d50bedf7dacb1e87b (diff)
download postgresql-0e6aa8747d439bb7f08f95e358f0509c50396785.tar.gz
postgresql-0e6aa8747d439bb7f08f95e358f0509c50396785.zip
Avoid determining regexp subexpression matches, when possible.

Identifying the precise match locations for parenthesized subexpressions is a fairly expensive task given the way our regexp engine works, both at regexp compile time (where we must create an optimized NFA for each parenthesized subexpression) and at runtime (where determining exact match locations requires laborious search). Up to now we've made little attempt to optimize this situation. This patch identifies cases where we know at compile time that we won't need to know subexpression match locations, and teaches the regexp compiler to not bother creating per-subexpression regexps for parenthesis pairs that are not referenced by backrefs elsewhere in the regexp. (To preserve semantics, we obviously still have to pin down the match locations of backref references.)

Users could have obtained the same results before this by being careful to write "non-capturing" parentheses wherever possible, but few people bother with that.

Discussion: https://postgr.es/m/2219936.1628115334@sss.pgh.pa.us

Diffstat (limited to 'src/backend/utils/adt')
-rw-r--r-- src/backend/utils/adt/jsonpath_gram.y | 8
-rw-r--r-- src/backend/utils/adt/regexp.c | 12
2 files changed, 18 insertions, 2 deletions
diff --git a/src/backend/utils/adt/jsonpath_gram.y b/src/backend/utils/adt/jsonpath_gram.y
index de3d97931ef..bd5d4488a06 100644
--- a/src/backend/utils/adt/jsonpath_gram.y
+++ b/src/backend/utils/adt/jsonpath_gram.y
@@ -583,6 +583,14 @@ jspConvertRegexFlags(uint32 xflags)
errmsg("XQuery \"x\" flag (expanded regular expressions) is not implemented")));
}
+ /*
+ * We'll never need sub-match details at execution. While
+ * RE_compile_and_execute would set this flag anyway, force it on here to
+ * ensure that the regex cache entries created by makeItemLikeRegex are
+ * useful.
+ */
+ cflags |= REG_NOSUB;
+
return cflags;
}
diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c
index 484d4265fd8..268cee1cbed 100644
--- a/src/backend/utils/adt/regexp.c
+++ b/src/backend/utils/adt/regexp.c
@@ -347,6 +347,10 @@ RE_compile_and_execute(text *text_re, char *dat, int dat_len,
{
regex_t *re;
+ /* Use REG_NOSUB if caller does not want sub-match details */
+ if (nmatch < 2)
+ cflags |= REG_NOSUB;
+
/* Compile RE */
re = RE_compile_and_cache(text_re, cflags, collation);
@@ -1412,6 +1416,7 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
int orig_len;
pg_wchar *wide_str;
int wide_len;
+ int cflags;
regex_t *cpattern;
regmatch_t *pmatch;
int pmatch_len;
@@ -1430,7 +1435,10 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
wide_len = pg_mb2wchar_with_len(VARDATA_ANY(orig_str), wide_str, orig_len);
/* set up the compiled pattern */
- cpattern = RE_compile_and_cache(pattern, re_flags->cflags, collation);
+ cflags = re_flags->cflags;
+ if (!use_subpatterns)
+ cflags |= REG_NOSUB;
+ cpattern = RE_compile_and_cache(pattern, cflags, collation);
/* do we want to remember subpatterns? */
if (use_subpatterns && cpattern->re_nsub > 0)
@@ -1952,7 +1960,7 @@ regexp_fixed_prefix(text *text_re, bool case_insensitive, Oid collation,
if (case_insensitive)
cflags |= REG_ICASE;
- re = RE_compile_and_cache(text_re, cflags, collation);
+ re = RE_compile_and_cache(text_re, cflags | REG_NOSUB, collation);
/* Examine it to see if there's a fixed prefix */
re_result = pg_regprefix(re, &str, &slen);