aboutsummaryrefslogtreecommitdiff
path: root/src/backend/regex/regexec.c
diff options
context:
space:
mode:
authorTom Lane <tgl@sss.pgh.pa.us>2021-02-20 19:07:45 -0500
committerTom Lane <tgl@sss.pgh.pa.us>2021-02-20 19:07:45 -0500
commit581043089472816061a7fd381f40572191dfa48f (patch)
tree84043d4072aa2e68d87959b2a5dbd565c2fec678 /src/backend/regex/regexec.c
parentcebc1d34e5207c37871708f91be65dd839760b5f (diff)
downloadpostgresql-581043089472816061a7fd381f40572191dfa48f.tar.gz
postgresql-581043089472816061a7fd381f40572191dfa48f.zip
Convert regex engine's subre tree from binary to N-ary style.
Instead of having left and right child links in subre structs, have a single child link plus a sibling link. Multiple children of a tree node are now reached by chasing the sibling chain. The beneficiary of this is alternation tree nodes. A regular expression with N (>1) branches is now represented by one alternation node with N children, rather than a tree that includes N alternation nodes as well as N children. While the old representation didn't really cost anything extra at execution time, it was pretty horrid for compilation purposes, because each of the alternation nodes had its own NFA, which we were too stupid not to separately optimize. (To make matters worse, all of those NFAs described the entire alternation pattern, not just the portion of it that one might expect from the tree structure.) We continue to require concatenation nodes to have exactly two children. This data structure is now prepared to support more, but the executor's logic would need some careful redesign, and it's not clear that a lot of benefit could be had. This is part of a patch series that in total reduces the regex engine's runtime by about a factor of four on a large corpus of real-world regexes. Patch by me, reviewed by Joel Jacobson Discussion: https://postgr.es/m/1340281.1613018383@sss.pgh.pa.us
Diffstat (limited to 'src/backend/regex/regexec.c')
-rw-r--r--src/backend/regex/regexec.c104
1 files changed, 57 insertions, 47 deletions
diff --git a/src/backend/regex/regexec.c b/src/backend/regex/regexec.c
index adcbcc0a8ea..4541cf9a7e0 100644
--- a/src/backend/regex/regexec.c
+++ b/src/backend/regex/regexec.c
@@ -640,6 +640,8 @@ static void
zaptreesubs(struct vars *v,
struct subre *t)
{
+ struct subre *t2;
+
if (t->op == '(')
{
int n = t->subno;
@@ -652,10 +654,8 @@ zaptreesubs(struct vars *v,
}
}
- if (t->left != NULL)
- zaptreesubs(v, t->left);
- if (t->right != NULL)
- zaptreesubs(v, t->right);
+ for (t2 = t->child; t2 != NULL; t2 = t2->sibling)
+ zaptreesubs(v, t2);
}
/*
@@ -714,35 +714,35 @@ cdissect(struct vars *v,
switch (t->op)
{
case '=': /* terminal node */
- assert(t->left == NULL && t->right == NULL);
+ assert(t->child == NULL);
er = REG_OKAY; /* no action, parent did the work */
break;
case 'b': /* back reference */
- assert(t->left == NULL && t->right == NULL);
+ assert(t->child == NULL);
er = cbrdissect(v, t, begin, end);
break;
case '.': /* concatenation */
- assert(t->left != NULL && t->right != NULL);
- if (t->left->flags & SHORTER) /* reverse scan */
+ assert(t->child != NULL);
+ if (t->child->flags & SHORTER) /* reverse scan */
er = crevcondissect(v, t, begin, end);
else
er = ccondissect(v, t, begin, end);
break;
case '|': /* alternation */
- assert(t->left != NULL);
+ assert(t->child != NULL);
er = caltdissect(v, t, begin, end);
break;
case '*': /* iteration */
- assert(t->left != NULL);
- if (t->left->flags & SHORTER) /* reverse scan */
+ assert(t->child != NULL);
+ if (t->child->flags & SHORTER) /* reverse scan */
er = creviterdissect(v, t, begin, end);
else
er = citerdissect(v, t, begin, end);
break;
case '(': /* capturing */
- assert(t->left != NULL && t->right == NULL);
+ assert(t->child != NULL);
assert(t->subno > 0);
- er = cdissect(v, t->left, begin, end);
+ er = cdissect(v, t->child, begin, end);
if (er == REG_OKAY)
subset(v, t, begin, end);
break;
@@ -770,19 +770,22 @@ ccondissect(struct vars *v,
chr *begin, /* beginning of relevant substring */
chr *end) /* end of same */
{
+ struct subre *left = t->child;
+ struct subre *right = left->sibling;
struct dfa *d;
struct dfa *d2;
chr *mid;
int er;
assert(t->op == '.');
- assert(t->left != NULL && t->left->cnfa.nstates > 0);
- assert(t->right != NULL && t->right->cnfa.nstates > 0);
- assert(!(t->left->flags & SHORTER));
+ assert(left != NULL && left->cnfa.nstates > 0);
+ assert(right != NULL && right->cnfa.nstates > 0);
+ assert(right->sibling == NULL);
+ assert(!(left->flags & SHORTER));
- d = getsubdfa(v, t->left);
+ d = getsubdfa(v, left);
NOERR();
- d2 = getsubdfa(v, t->right);
+ d2 = getsubdfa(v, right);
NOERR();
MDEBUG(("%d: ccondissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));
@@ -799,10 +802,10 @@ ccondissect(struct vars *v,
/* try this midpoint on for size */
if (longest(v, d2, mid, end, (int *) NULL) == end)
{
- er = cdissect(v, t->left, begin, mid);
+ er = cdissect(v, left, begin, mid);
if (er == REG_OKAY)
{
- er = cdissect(v, t->right, mid, end);
+ er = cdissect(v, right, mid, end);
if (er == REG_OKAY)
{
/* satisfaction */
@@ -831,8 +834,8 @@ ccondissect(struct vars *v,
return REG_NOMATCH;
}
MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid)));
- zaptreesubs(v, t->left);
- zaptreesubs(v, t->right);
+ zaptreesubs(v, left);
+ zaptreesubs(v, right);
}
/* can't get here */
@@ -848,19 +851,22 @@ crevcondissect(struct vars *v,
chr *begin, /* beginning of relevant substring */
chr *end) /* end of same */
{
+ struct subre *left = t->child;
+ struct subre *right = left->sibling;
struct dfa *d;
struct dfa *d2;
chr *mid;
int er;
assert(t->op == '.');
- assert(t->left != NULL && t->left->cnfa.nstates > 0);
- assert(t->right != NULL && t->right->cnfa.nstates > 0);
- assert(t->left->flags & SHORTER);
+ assert(left != NULL && left->cnfa.nstates > 0);
+ assert(right != NULL && right->cnfa.nstates > 0);
+ assert(right->sibling == NULL);
+ assert(left->flags & SHORTER);
- d = getsubdfa(v, t->left);
+ d = getsubdfa(v, left);
NOERR();
- d2 = getsubdfa(v, t->right);
+ d2 = getsubdfa(v, right);
NOERR();
MDEBUG(("%d: crevcondissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));
@@ -877,10 +883,10 @@ crevcondissect(struct vars *v,
/* try this midpoint on for size */
if (longest(v, d2, mid, end, (int *) NULL) == end)
{
- er = cdissect(v, t->left, begin, mid);
+ er = cdissect(v, left, begin, mid);
if (er == REG_OKAY)
{
- er = cdissect(v, t->right, mid, end);
+ er = cdissect(v, right, mid, end);
if (er == REG_OKAY)
{
/* satisfaction */
@@ -909,8 +915,8 @@ crevcondissect(struct vars *v,
return REG_NOMATCH;
}
MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid)));
- zaptreesubs(v, t->left);
- zaptreesubs(v, t->right);
+ zaptreesubs(v, left);
+ zaptreesubs(v, right);
}
/* can't get here */
@@ -1011,26 +1017,30 @@ caltdissect(struct vars *v,
struct dfa *d;
int er;
- /* We loop, rather than tail-recurse, to handle a chain of alternatives */
+ assert(t->op == '|');
+
+ t = t->child;
+ /* there should be at least 2 alternatives */
+ assert(t != NULL && t->sibling != NULL);
+
while (t != NULL)
{
- assert(t->op == '|');
- assert(t->left != NULL && t->left->cnfa.nstates > 0);
+ assert(t->cnfa.nstates > 0);
MDEBUG(("%d: caltdissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));
- d = getsubdfa(v, t->left);
+ d = getsubdfa(v, t);
NOERR();
if (longest(v, d, begin, end, (int *) NULL) == end)
{
MDEBUG(("%d: caltdissect matched\n", t->id));
- er = cdissect(v, t->left, begin, end);
+ er = cdissect(v, t, begin, end);
if (er != REG_NOMATCH)
return er;
}
NOERR();
- t = t->right;
+ t = t->sibling;
}
return REG_NOMATCH;
@@ -1056,8 +1066,8 @@ citerdissect(struct vars *v,
int er;
assert(t->op == '*');
- assert(t->left != NULL && t->left->cnfa.nstates > 0);
- assert(!(t->left->flags & SHORTER));
+ assert(t->child != NULL && t->child->cnfa.nstates > 0);
+ assert(!(t->child->flags & SHORTER));
assert(begin <= end);
MDEBUG(("%d: citerdissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));
@@ -1094,7 +1104,7 @@ citerdissect(struct vars *v,
return REG_ESPACE;
endpts[0] = begin;
- d = getsubdfa(v, t->left);
+ d = getsubdfa(v, t->child);
if (ISERR())
{
FREE(endpts);
@@ -1172,8 +1182,8 @@ citerdissect(struct vars *v,
for (i = nverified + 1; i <= k; i++)
{
- zaptreesubs(v, t->left);
- er = cdissect(v, t->left, endpts[i - 1], endpts[i]);
+ zaptreesubs(v, t->child);
+ er = cdissect(v, t->child, endpts[i - 1], endpts[i]);
if (er == REG_OKAY)
{
nverified = i;
@@ -1258,8 +1268,8 @@ creviterdissect(struct vars *v,
int er;
assert(t->op == '*');
- assert(t->left != NULL && t->left->cnfa.nstates > 0);
- assert(t->left->flags & SHORTER);
+ assert(t->child != NULL && t->child->cnfa.nstates > 0);
+ assert(t->child->flags & SHORTER);
assert(begin <= end);
MDEBUG(("%d: creviterdissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));
@@ -1299,7 +1309,7 @@ creviterdissect(struct vars *v,
return REG_ESPACE;
endpts[0] = begin;
- d = getsubdfa(v, t->left);
+ d = getsubdfa(v, t->child);
if (ISERR())
{
FREE(endpts);
@@ -1383,8 +1393,8 @@ creviterdissect(struct vars *v,
for (i = nverified + 1; i <= k; i++)
{
- zaptreesubs(v, t->left);
- er = cdissect(v, t->left, endpts[i - 1], endpts[i]);
+ zaptreesubs(v, t->child);
+ er = cdissect(v, t->child, endpts[i - 1], endpts[i]);
if (er == REG_OKAY)
{
nverified = i;