diff options
author | Tom Lane <tgl@sss.pgh.pa.us> | 2021-08-09 11:26:34 -0400 |
---|---|---|
committer | Tom Lane <tgl@sss.pgh.pa.us> | 2021-08-09 11:26:34 -0400 |
commit | 0e6aa8747d439bb7f08f95e358f0509c50396785 (patch) | |
tree | 64977d63a54b4f5c266692966282c6d7aab3aba7 /src/backend/regex/regexec.c | |
parent | 76ad24400d73fa10d527844d50bedf7dacb1e87b (diff) | |
download | postgresql-0e6aa8747d439bb7f08f95e358f0509c50396785.tar.gz postgresql-0e6aa8747d439bb7f08f95e358f0509c50396785.zip |
Avoid determining regexp subexpression matches, when possible.
Identifying the precise match locations for parenthesized subexpressions
is a fairly expensive task given the way our regexp engine works, both
at regexp compile time (where we must create an optimized NFA for each
parenthesized subexpression) and at runtime (where determining exact
match locations requires laborious search).
Up to now we've made little attempt to optimize this situation. This
patch identifies cases where we know at compile time that we won't
need to know subexpression match locations, and teaches the regexp
compiler to not bother creating per-subexpression regexps for
parenthesis pairs that are not referenced by backrefs elsewhere in
the regexp. (To preserve semantics, we obviously still have to
pin down the match locations of the subexpressions that backrefs refer to.) Users could
have obtained the same results before this by being careful to
write "non-capturing" parentheses wherever possible, but few people
bother with that.
Discussion: https://postgr.es/m/2219936.1628115334@sss.pgh.pa.us
Diffstat (limited to 'src/backend/regex/regexec.c')
-rw-r--r-- | src/backend/regex/regexec.c | 36 |
1 file changed, 25 insertions, 11 deletions
diff --git a/src/backend/regex/regexec.c b/src/backend/regex/regexec.c index 5b9a0878203..db54ebfba40 100644 --- a/src/backend/regex/regexec.c +++ b/src/backend/regex/regexec.c @@ -213,12 +213,9 @@ pg_regexec(regex_t *re, return REG_NOMATCH; backref = (v->g->info & REG_UBACKREF) ? 1 : 0; v->eflags = flags; - if (v->g->cflags & REG_NOSUB) - nmatch = 0; /* override client */ - v->nmatch = nmatch; - if (backref) + if (backref && nmatch <= v->g->nsub) { - /* need work area */ + /* need larger work area */ if (v->g->nsub + 1 <= LOCALMAT) v->pmatch = mat; else @@ -229,7 +226,17 @@ pg_regexec(regex_t *re, v->nmatch = v->g->nsub + 1; } else + { + /* we can store results directly in caller's array */ v->pmatch = pmatch; + /* ensure any extra entries in caller's array are filled with -1 */ + if (nmatch > 0) + zapallsubs(pmatch, nmatch); + /* then forget about extra entries, to avoid useless work in find() */ + if (nmatch > v->g->nsub + 1) + nmatch = v->g->nsub + 1; + v->nmatch = nmatch; + } v->details = details; v->start = (chr *) string; v->search_start = (chr *) string + search_start; @@ -290,12 +297,20 @@ pg_regexec(regex_t *re, else st = find(v, &v->g->tree->cnfa, &v->g->cmap); - /* copy (portion of) match vector over if necessary */ - if (st == REG_OKAY && v->pmatch != pmatch && nmatch > 0) + /* on success, ensure caller's match vector is filled correctly */ + if (st == REG_OKAY && nmatch > 0) { - zapallsubs(pmatch, nmatch); - n = (nmatch < v->nmatch) ? 
nmatch : v->nmatch; - memcpy(VS(pmatch), VS(v->pmatch), n * sizeof(regmatch_t)); + if (v->pmatch != pmatch) + { + /* copy portion of match vector over from (larger) work area */ + assert(nmatch <= v->nmatch); + memcpy(VS(pmatch), VS(v->pmatch), nmatch * sizeof(regmatch_t)); + } + if (v->g->cflags & REG_NOSUB) + { + /* don't expose possibly-partial sub-match results to caller */ + zapallsubs(pmatch, nmatch); + } } /* clean up */ @@ -752,7 +767,6 @@ cdissect(struct vars *v, break; case '(': /* no-op capture node */ assert(t->child != NULL); - assert(t->capno > 0); er = cdissect(v, t->child, begin, end); break; default: |