aboutsummaryrefslogtreecommitdiff
path: root/src/backend/regex/regexec.c
diff options
context:
space:
mode:
author Tom Lane <tgl@sss.pgh.pa.us> 2021-03-02 11:55:12 -0500
committer Tom Lane <tgl@sss.pgh.pa.us> 2021-03-02 11:55:12 -0500
commit 0c3405cf11a12da1a4278c6833f4d979fe06c866 (patch)
tree 1a7d72b194224ceb9933533131cebc4ada5a502f /src/backend/regex/regexec.c
parent 4aea704a5bfd4b5894a268499369ccab89940c9c (diff)
download postgresql-0c3405cf11a12da1a4278c6833f4d979fe06c866.tar.gz
postgresql-0c3405cf11a12da1a4278c6833f4d979fe06c866.zip
Improve performance of regular expression back-references.

In some cases, at the time that we're doing an NFA-based precheck
of whether a backref subexpression can match at a particular place
in the string, we already know which substring the referenced
subexpression matched.  If so, we might as well forget about the NFA
and just compare the substring; this is faster and it gives an exact
rather than approximate answer.

In general, this optimization can help while we are prechecking within
the second child expression of a concat node, while the capture was
within the first child expression; then the substring was saved during
cdissect() of the first child and will be available to NFA checks done
while cdissect() recurses into the second child.  It can help quite a
lot if the tree looks like

              concat
             /      \
      capture        concat
                    /      \
     expensive stuff        backref

as we will be able to avoid recursively dissecting the "expensive
stuff" before discovering that the backref isn't satisfied with a
particular midpoint that the lower concat node is testing.  This
doesn't help if the concat tree is left-deep, as the capture node
won't get set soon enough (and it's hard to fix that without changing
the engine's match behavior).  Fortunately, right-deep concat trees
are the common case.

Patch by me, reviewed by Joel Jacobson

Discussion: https://postgr.es/m/661609.1614560029@sss.pgh.pa.us
Diffstat (limited to 'src/backend/regex/regexec.c')
-rw-r--r-- src/backend/regex/regexec.c 24
1 files changed, 21 insertions, 3 deletions
diff --git a/src/backend/regex/regexec.c b/src/backend/regex/regexec.c
index cf959899489..8d7777f8c62 100644
--- a/src/backend/regex/regexec.c
+++ b/src/backend/regex/regexec.c
@@ -77,6 +77,9 @@ struct dfa
chr *lastpost; /* location of last cache-flushed success */
chr *lastnopr; /* location of last cache-flushed NOPROGRESS */
struct sset *search; /* replacement-search-pointer memory */
+ int backno; /* if DFA for a backref, subno it refers to */
+ short backmin; /* min repetitions for backref */
+ short backmax; /* max repetitions for backref */
bool ismalloced; /* should this struct dfa be freed? */
bool arraysmalloced; /* should its subsidiary arrays be freed? */
};
@@ -154,6 +157,7 @@ static int creviterdissect(struct vars *, struct subre *, chr *, chr *);
static chr *longest(struct vars *, struct dfa *, chr *, chr *, int *);
static chr *shortest(struct vars *, struct dfa *, chr *, chr *, chr *, chr **, int *);
static int matchuntil(struct vars *, struct dfa *, chr *, struct sset **, chr **);
+static chr *dfa_backref(struct vars *, struct dfa *, chr *, chr *, chr *, bool);
static chr *lastcold(struct vars *, struct dfa *);
static struct dfa *newdfa(struct vars *, struct cnfa *, struct colormap *, struct smalldfa *);
static void freedfa(struct dfa *);
@@ -342,13 +346,23 @@ static struct dfa *
getsubdfa(struct vars *v,
struct subre *t)
{
- if (v->subdfas[t->id] == NULL)
+ struct dfa *d = v->subdfas[t->id];
+
+ if (d == NULL)
{
- v->subdfas[t->id] = newdfa(v, &t->cnfa, &v->g->cmap, DOMALLOC);
+ d = newdfa(v, &t->cnfa, &v->g->cmap, DOMALLOC);
if (ISERR())
return NULL;
+ /* set up additional info if this is a backref node */
+ if (t->op == 'b')
+ {
+ d->backno = t->backno;
+ d->backmin = t->min;
+ d->backmax = t->max;
+ }
+ v->subdfas[t->id] = d;
}
- return v->subdfas[t->id];
+ return d;
}
/*
@@ -369,6 +383,7 @@ getladfa(struct vars *v,
v->ladfas[n] = newdfa(v, &sub->cnfa, &v->g->cmap, DOMALLOC);
if (ISERR())
return NULL;
+ /* a LACON can't contain a backref, so nothing else to do */
}
return v->ladfas[n];
}
@@ -927,6 +942,9 @@ crevcondissect(struct vars *v,
/*
* cbrdissect - dissect match for backref node
+ *
+ * The backref match might already have been verified by dfa_backref(),
+ * but we don't know that for sure so must check it here.
*/
static int /* regexec return code */
cbrdissect(struct vars *v,