aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/backend/regex/regc_nfa.c296
-rw-r--r--src/backend/regex/regcomp.c5
-rw-r--r--src/backend/regex/rege_dfa.c57
-rw-r--r--src/backend/regex/regprefix.c4
-rw-r--r--src/include/regex/regguts.h15
5 files changed, 376 insertions, 1 deletions
diff --git a/src/backend/regex/regc_nfa.c b/src/backend/regex/regc_nfa.c
index ff98bfd694e..1a6c9ce1838 100644
--- a/src/backend/regex/regc_nfa.c
+++ b/src/backend/regex/regc_nfa.c
@@ -65,6 +65,8 @@ newnfa(struct vars *v,
nfa->v = v;
nfa->bos[0] = nfa->bos[1] = COLORLESS;
nfa->eos[0] = nfa->eos[1] = COLORLESS;
+ nfa->flags = 0;
+ nfa->minmatchall = nfa->maxmatchall = -1;
nfa->parent = parent; /* Precedes newfstate so parent is valid. */
nfa->post = newfstate(nfa, '@'); /* number 0 */
nfa->pre = newfstate(nfa, '>'); /* number 1 */
@@ -2875,8 +2877,14 @@ analyze(struct nfa *nfa)
if (NISERR())
return 0;
+ /* Detect whether NFA can't match anything */
if (nfa->pre->outs == NULL)
return REG_UIMPOSSIBLE;
+
+ /* Detect whether NFA matches all strings (possibly with length bounds) */
+ checkmatchall(nfa);
+
+ /* Detect whether NFA can possibly match a zero-length string */
for (a = nfa->pre->outs; a != NULL; a = a->outchain)
for (aa = a->to->outs; aa != NULL; aa = aa->outchain)
if (aa->to == nfa->post)
@@ -2885,6 +2893,282 @@ analyze(struct nfa *nfa)
}
/*
+ * checkmatchall - does the NFA represent no more than a string length test?
+ *
+ * If so, set nfa->minmatchall and nfa->maxmatchall correctly (they are -1
+ * to begin with) and set the MATCHALL bit in nfa->flags.
+ *
+ * To succeed, we require all arcs to be PLAIN RAINBOW arcs, except for those
+ * for pseudocolors (i.e., BOS/BOL/EOS/EOL). We must be able to reach the
+ * post state via RAINBOW arcs, and if there are any loops in the graph, they
+ * must be loop-to-self arcs, ensuring that each loop iteration consumes
+ * exactly one character. (Longer loops are problematic because they create
+ * non-consecutive possible match lengths; we have no good way to represent
+ * that situation for lengths beyond the DUPINF limit.)
+ *
+ * Pseudocolor arcs complicate things a little. We know that they can only
+ * appear as pre-state outarcs (for BOS/BOL) or post-state inarcs (for
+ * EOS/EOL). There, they must exactly replicate the parallel RAINBOW arcs,
+ * e.g. if the pre state has one RAINBOW outarc to state 2, it must have BOS
+ * and BOL outarcs to state 2, and no others. Missing or extra pseudocolor
+ * arcs can occur, meaning that the NFA involves some constraint on the
+ * adjacent characters, which makes it not a matchall NFA.
+ */
+static void
+checkmatchall(struct nfa *nfa)
+{
+ bool hasmatch[DUPINF + 1];
+ int minmatch,
+ maxmatch,
+ morematch;
+
+ /*
+ * hasmatch[i] will be set true if a match of length i is feasible, for i
+ * from 0 to DUPINF-1. hasmatch[DUPINF] will be set true if every match
+ * length of DUPINF or more is feasible.
+ */
+ memset(hasmatch, 0, sizeof(hasmatch));
+
+ /*
+ * Recursively search the graph for all-RAINBOW paths to the "post" state,
+ * starting at the "pre" state. The -1 initial depth accounts for the
+ * fact that transitions out of the "pre" state are not part of the
+ * matched string. We likewise don't count the final transition to the
+ * "post" state as part of the match length. (But we still insist that
+ * those transitions have RAINBOW arcs, otherwise there are lookbehind or
+ * lookahead constraints at the start/end of the pattern.)
+ */
+ if (!checkmatchall_recurse(nfa, nfa->pre, false, -1, hasmatch))
+ return;
+
+ /*
+ * We found some all-RAINBOW paths, and not anything that we couldn't
+ * handle. Now verify that pseudocolor arcs adjacent to the pre and post
+ * states match the RAINBOW arcs there. (We could do this while
+ * recursing, but it's expensive and unlikely to fail, so do it last.)
+ */
+ if (!check_out_colors_match(nfa->pre, RAINBOW, nfa->bos[0]) ||
+ !check_out_colors_match(nfa->pre, nfa->bos[0], RAINBOW) ||
+ !check_out_colors_match(nfa->pre, RAINBOW, nfa->bos[1]) ||
+ !check_out_colors_match(nfa->pre, nfa->bos[1], RAINBOW))
+ return;
+ if (!check_in_colors_match(nfa->post, RAINBOW, nfa->eos[0]) ||
+ !check_in_colors_match(nfa->post, nfa->eos[0], RAINBOW) ||
+ !check_in_colors_match(nfa->post, RAINBOW, nfa->eos[1]) ||
+ !check_in_colors_match(nfa->post, nfa->eos[1], RAINBOW))
+ return;
+
+ /*
+ * hasmatch[] now represents the set of possible match lengths; but we
+ * want to reduce that to a min and max value, because it doesn't seem
+ * worth complicating regexec.c to deal with nonconsecutive possible match
+ * lengths. Find min and max of first run of lengths, then verify there
+ * are no nonconsecutive lengths.
+ */
+ for (minmatch = 0; minmatch <= DUPINF; minmatch++)
+ {
+ if (hasmatch[minmatch])
+ break;
+ }
+ assert(minmatch <= DUPINF); /* else checkmatchall_recurse lied */
+ for (maxmatch = minmatch; maxmatch < DUPINF; maxmatch++)
+ {
+ if (!hasmatch[maxmatch + 1])
+ break;
+ }
+ for (morematch = maxmatch + 1; morematch <= DUPINF; morematch++)
+ {
+ if (hasmatch[morematch])
+ return; /* fail, there are nonconsecutive lengths */
+ }
+
+ /* Success, so record the info */
+ nfa->minmatchall = minmatch;
+ nfa->maxmatchall = maxmatch;
+ nfa->flags |= MATCHALL;
+}
+
+/*
+ * checkmatchall_recurse - recursive search for checkmatchall
+ *
+ * s is the current state
+ * foundloop is true if any predecessor state has a loop-to-self
+ * depth is the current recursion depth (starting at -1)
+ * hasmatch[] is the output area for recording feasible match lengths
+ *
+ * We return true if there is at least one all-RAINBOW path to the "post"
+ * state and no non-matchall paths; otherwise false. Note we assume that
+ * any dead-end paths have already been removed, else we might return
+ * false unnecessarily.
+ */
+static bool
+checkmatchall_recurse(struct nfa *nfa, struct state *s,
+ bool foundloop, int depth,
+ bool *hasmatch)
+{
+ bool result = false;
+ struct arc *a;
+
+ /*
+ * Since this is recursive, it could be driven to stack overflow. But we
+ * need not treat that as a hard failure; just deem the NFA non-matchall.
+ */
+ if (STACK_TOO_DEEP(nfa->v->re))
+ return false;
+
+ /*
+ * Likewise, if we get to a depth too large to represent correctly in
+ * maxmatchall, fail quietly.
+ */
+ if (depth >= DUPINF)
+ return false;
+
+ /*
+ * Scan the outarcs to detect cases we can't handle, and to see if there
+ * is a loop-to-self here. We need to know about any such loop before we
+ * recurse, so it's hard to avoid making two passes over the outarcs. In
+ * any case, checking for showstoppers before we recurse is probably best.
+ */
+ for (a = s->outs; a != NULL; a = a->outchain)
+ {
+ if (a->type != PLAIN)
+ return false; /* any LACONs make it non-matchall */
+ if (a->co != RAINBOW)
+ {
+ if (nfa->cm->cd[a->co].flags & PSEUDO)
+ {
+ /*
+ * Pseudocolor arc: verify it's in a valid place (this seems
+ * quite unlikely to fail, but let's be sure).
+ */
+ if (s == nfa->pre &&
+ (a->co == nfa->bos[0] || a->co == nfa->bos[1]))
+ /* okay BOS/BOL arc */ ;
+ else if (a->to == nfa->post &&
+ (a->co == nfa->eos[0] || a->co == nfa->eos[1]))
+ /* okay EOS/EOL arc */ ;
+ else
+ return false; /* unexpected pseudocolor arc */
+ /* We'll finish checking these arcs after the recursion */
+ continue;
+ }
+ return false; /* any other color makes it non-matchall */
+ }
+ if (a->to == s)
+ {
+ /*
+ * We found a cycle of length 1, so remember that to pass down to
+ * successor states. (It doesn't matter if there was also such a
+ * loop at a predecessor state.)
+ */
+ foundloop = true;
+ }
+ else if (a->to->tmp)
+ {
+ /* We found a cycle of length > 1, so fail. */
+ return false;
+ }
+ }
+
+ /* We need to recurse, so mark state as under consideration */
+ assert(s->tmp == NULL);
+ s->tmp = s;
+
+ for (a = s->outs; a != NULL; a = a->outchain)
+ {
+ if (a->co != RAINBOW)
+ continue; /* ignore pseudocolor arcs */
+ if (a->to == nfa->post)
+ {
+ /* We found an all-RAINBOW path to the post state */
+ result = true;
+ /* Record potential match lengths */
+ assert(depth >= 0);
+ hasmatch[depth] = true;
+ if (foundloop)
+ {
+ /* A predecessor loop makes all larger lengths match, too */
+ int i;
+
+ for (i = depth + 1; i <= DUPINF; i++)
+ hasmatch[i] = true;
+ }
+ }
+ else if (a->to != s)
+ {
+ /* This is a new path forward; recurse to investigate */
+ result = checkmatchall_recurse(nfa, a->to,
+ foundloop, depth + 1,
+ hasmatch);
+ /* Fail if any recursive path fails */
+ if (!result)
+ break;
+ }
+ }
+
+ s->tmp = NULL;
+ return result;
+}
+
+/*
+ * check_out_colors_match - subroutine for checkmatchall
+ *
+ * Check if every s outarc of color co1 has a matching outarc of color co2.
+ * (checkmatchall_recurse already verified that all of the outarcs are PLAIN,
+ * so we need not examine arc types here. Also, since it verified that there
+ * are only RAINBOW and pseudocolor arcs, there shouldn't be enough arcs for
+ * this brute-force O(N^2) implementation to cause problems.)
+ */
+static bool
+check_out_colors_match(struct state *s, color co1, color co2)
+{
+ struct arc *a1;
+ struct arc *a2;
+
+ for (a1 = s->outs; a1 != NULL; a1 = a1->outchain)
+ {
+ if (a1->co != co1)
+ continue;
+ for (a2 = s->outs; a2 != NULL; a2 = a2->outchain)
+ {
+ if (a2->co == co2 && a2->to == a1->to)
+ break;
+ }
+ if (a2 == NULL)
+ return false;
+ }
+ return true;
+}
+
+/*
+ * check_in_colors_match - subroutine for checkmatchall
+ *
+ * Check if every s inarc of color co1 has a matching inarc of color co2.
+ * (For paranoia's sake, ignore any non-PLAIN arcs here. But we're still
+ * not expecting very many arcs.)
+ */
+static bool
+check_in_colors_match(struct state *s, color co1, color co2)
+{
+ struct arc *a1;
+ struct arc *a2;
+
+ for (a1 = s->ins; a1 != NULL; a1 = a1->inchain)
+ {
+ if (a1->type != PLAIN || a1->co != co1)
+ continue;
+ for (a2 = s->ins; a2 != NULL; a2 = a2->inchain)
+ {
+ if (a2->type == PLAIN && a2->co == co2 && a2->from == a1->from)
+ break;
+ }
+ if (a2 == NULL)
+ return false;
+ }
+ return true;
+}
+
+/*
* compact - construct the compact representation of an NFA
*/
static void
@@ -2930,7 +3214,9 @@ compact(struct nfa *nfa,
cnfa->eos[0] = nfa->eos[0];
cnfa->eos[1] = nfa->eos[1];
cnfa->ncolors = maxcolor(nfa->cm) + 1;
- cnfa->flags = 0;
+ cnfa->flags = nfa->flags;
+ cnfa->minmatchall = nfa->minmatchall;
+ cnfa->maxmatchall = nfa->maxmatchall;
ca = cnfa->arcs;
for (s = nfa->states; s != NULL; s = s->next)
@@ -3034,6 +3320,11 @@ dumpnfa(struct nfa *nfa,
fprintf(f, ", eos [%ld]", (long) nfa->eos[0]);
if (nfa->eos[1] != COLORLESS)
fprintf(f, ", eol [%ld]", (long) nfa->eos[1]);
+ if (nfa->flags & HASLACONS)
+ fprintf(f, ", haslacons");
+ if (nfa->flags & MATCHALL)
+ fprintf(f, ", minmatchall %d, maxmatchall %d",
+ nfa->minmatchall, nfa->maxmatchall);
fprintf(f, "\n");
for (s = nfa->states; s != NULL; s = s->next)
{
@@ -3201,6 +3492,9 @@ dumpcnfa(struct cnfa *cnfa,
fprintf(f, ", eol [%ld]", (long) cnfa->eos[1]);
if (cnfa->flags & HASLACONS)
fprintf(f, ", haslacons");
+ if (cnfa->flags & MATCHALL)
+ fprintf(f, ", minmatchall %d, maxmatchall %d",
+ cnfa->minmatchall, cnfa->maxmatchall);
fprintf(f, "\n");
for (st = 0; st < cnfa->nstates; st++)
dumpcstate(st, cnfa, f);
diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c
index ae8dbe58191..39e837adc2a 100644
--- a/src/backend/regex/regcomp.c
+++ b/src/backend/regex/regcomp.c
@@ -175,6 +175,11 @@ static void cleanup(struct nfa *);
static void markreachable(struct nfa *, struct state *, struct state *, struct state *);
static void markcanreach(struct nfa *, struct state *, struct state *, struct state *);
static long analyze(struct nfa *);
+static void checkmatchall(struct nfa *);
+static bool checkmatchall_recurse(struct nfa *, struct state *,
+ bool, int, bool *);
+static bool check_out_colors_match(struct state *, color, color);
+static bool check_in_colors_match(struct state *, color, color);
static void compact(struct nfa *, struct cnfa *);
static void carcsort(struct carc *, size_t);
static int carc_cmp(const void *, const void *);
diff --git a/src/backend/regex/rege_dfa.c b/src/backend/regex/rege_dfa.c
index 32be2592c56..89d162ed6af 100644
--- a/src/backend/regex/rege_dfa.c
+++ b/src/backend/regex/rege_dfa.c
@@ -58,6 +58,29 @@ longest(struct vars *v,
if (hitstopp != NULL)
*hitstopp = 0;
+ /* fast path for matchall NFAs */
+ if (d->cnfa->flags & MATCHALL)
+ {
+ size_t nchr = stop - start;
+ size_t maxmatchall = d->cnfa->maxmatchall;
+
+ if (nchr < d->cnfa->minmatchall)
+ return NULL;
+ if (maxmatchall == DUPINF)
+ {
+ if (stop == v->stop && hitstopp != NULL)
+ *hitstopp = 1;
+ }
+ else
+ {
+ if (stop == v->stop && nchr <= maxmatchall + 1 && hitstopp != NULL)
+ *hitstopp = 1;
+ if (nchr > maxmatchall)
+ return start + maxmatchall;
+ }
+ return stop;
+ }
+
/* initialize */
css = initialize(v, d, start);
if (css == NULL)
@@ -187,6 +210,24 @@ shortest(struct vars *v,
if (hitstopp != NULL)
*hitstopp = 0;
+ /* fast path for matchall NFAs */
+ if (d->cnfa->flags & MATCHALL)
+ {
+ size_t nchr = min - start;
+
+ if (d->cnfa->maxmatchall != DUPINF &&
+ nchr > d->cnfa->maxmatchall)
+ return NULL;
+ if ((max - start) < d->cnfa->minmatchall)
+ return NULL;
+ if (nchr < d->cnfa->minmatchall)
+ min = start + d->cnfa->minmatchall;
+ if (coldp != NULL)
+ *coldp = start;
+ /* there is no case where we should set *hitstopp */
+ return min;
+ }
+
/* initialize */
css = initialize(v, d, start);
if (css == NULL)
@@ -312,6 +353,22 @@ matchuntil(struct vars *v,
struct sset *ss;
struct colormap *cm = d->cm;
+ /* fast path for matchall NFAs */
+ if (d->cnfa->flags & MATCHALL)
+ {
+ size_t nchr = probe - v->start;
+
+ /*
+ * It might seem that we should check maxmatchall too, but the .* at
+ * the front of the pattern absorbs any extra characters (and it was
+ * tacked on *after* computing minmatchall/maxmatchall). Thus, we
+ * should match if there are at least minmatchall characters.
+ */
+ if (nchr < d->cnfa->minmatchall)
+ return 0;
+ return 1;
+ }
+
/* initialize and startup, or restart, if necessary */
if (cp == NULL || cp > probe)
{
diff --git a/src/backend/regex/regprefix.c b/src/backend/regex/regprefix.c
index e2fbad7a8a9..ec435b6f5f6 100644
--- a/src/backend/regex/regprefix.c
+++ b/src/backend/regex/regprefix.c
@@ -77,6 +77,10 @@ pg_regprefix(regex_t *re,
assert(g->tree != NULL);
cnfa = &g->tree->cnfa;
+ /* matchall NFAs never have a fixed prefix */
+ if (cnfa->flags & MATCHALL)
+ return REG_NOMATCH;
+
/*
* Since a correct NFA should never contain any exit-free loops, it should
* not be possible for our traversal to return to a previously visited NFA
diff --git a/src/include/regex/regguts.h b/src/include/regex/regguts.h
index 6d391083194..82e761bfe57 100644
--- a/src/include/regex/regguts.h
+++ b/src/include/regex/regguts.h
@@ -331,6 +331,9 @@ struct nfa
struct colormap *cm; /* the color map */
color bos[2]; /* colors, if any, assigned to BOS and BOL */
color eos[2]; /* colors, if any, assigned to EOS and EOL */
+ int flags; /* flags to pass forward to cNFA */
+ int minmatchall; /* min number of chrs to match, if matchall */
+ int maxmatchall; /* max number of chrs to match, or DUPINF */
struct vars *v; /* simplifies compile error reporting */
struct nfa *parent; /* parent NFA, if any */
};
@@ -353,6 +356,14 @@ struct nfa
*
* Note that in a plain arc, "co" can be RAINBOW; since that's negative,
* it doesn't break the rule about how to recognize LACON arcs.
+ *
+ * We have special markings for "trivial" NFAs that can match any string
+ * (possibly with limits on the number of characters therein). In such a
+ * case, flags & MATCHALL is set (and HASLACONS can't be set). Then the
+ * fields minmatchall and maxmatchall give the minimum and maximum numbers
+ * of characters to match. For example, ".*" produces minmatchall = 0
+ * and maxmatchall = DUPINF, while ".+" produces minmatchall = 1 and
+ * maxmatchall = DUPINF.
*/
struct carc
{
@@ -366,6 +377,7 @@ struct cnfa
int ncolors; /* number of colors (max color in use + 1) */
int flags;
#define HASLACONS 01 /* uses lookaround constraints */
+#define MATCHALL 02 /* matches all strings of a range of lengths */
int pre; /* setup state number */
int post; /* teardown state number */
color bos[2]; /* colors, if any, assigned to BOS and BOL */
@@ -375,6 +387,9 @@ struct cnfa
struct carc **states; /* vector of pointers to outarc lists */
/* states[n] are pointers into a single malloc'd array of arcs */
struct carc *arcs; /* the area for the lists */
+ /* these fields are used only in a MATCHALL NFA (else they're -1): */
+ int minmatchall; /* min number of chrs to match */
+ int maxmatchall; /* max number of chrs to match, or DUPINF */
};
/*