aboutsummaryrefslogtreecommitdiff
path: root/src/backend/regex
diff options
context:
space:
mode:
authorTom Lane <tgl@sss.pgh.pa.us>2012-07-10 18:00:39 -0400
committerTom Lane <tgl@sss.pgh.pa.us>2012-07-10 18:00:39 -0400
commit18c8dc32ce6759a8837f01e3dd0036cd19ee8683 (patch)
treed2d058f0f362cf22af5856a4e20cc7f814835f87 /src/backend/regex
parentf12960d8c9ce454a37c2082549b7766ce36627d2 (diff)
downloadpostgresql-18c8dc32ce6759a8837f01e3dd0036cd19ee8683.tar.gz
postgresql-18c8dc32ce6759a8837f01e3dd0036cd19ee8683.zip
Back-patch fix for extraction of fixed prefixes from regular expressions.
Back-patch of commits 628cbb50ba80c83917b07a7609ddec12cda172d0 and c6aae3042be5249e672b731ebeb21875b5343010. This has been broken since 7.3, so back-patch to all supported branches.
Diffstat (limited to 'src/backend/regex')
-rw-r--r--src/backend/regex/Makefile2
-rw-r--r--src/backend/regex/README4
-rw-r--r--src/backend/regex/regc_color.c11
-rw-r--r--src/backend/regex/regc_nfa.c34
-rw-r--r--src/backend/regex/regcomp.c2
-rw-r--r--src/backend/regex/rege_dfa.c11
-rw-r--r--src/backend/regex/regprefix.c259
7 files changed, 296 insertions, 27 deletions
diff --git a/src/backend/regex/Makefile b/src/backend/regex/Makefile
index 21e7fa5329b..74a4c0c89d8 100644
--- a/src/backend/regex/Makefile
+++ b/src/backend/regex/Makefile
@@ -12,7 +12,7 @@ subdir = src/backend/regex
top_builddir = ../../..
include $(top_builddir)/src/Makefile.global
-OBJS = regcomp.o regerror.o regexec.o regfree.o
+OBJS = regcomp.o regerror.o regexec.o regfree.o regprefix.o
include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/regex/README b/src/backend/regex/README
index 89ba6a62ea2..c5d21e8c99d 100644
--- a/src/backend/regex/README
+++ b/src/backend/regex/README
@@ -7,12 +7,13 @@ So this file is an attempt to reverse-engineer some docs.
General source-file layout
--------------------------
-There are four separately-compilable source files, each exposing exactly
+There are five separately-compilable source files, each exposing exactly
one exported function:
regcomp.c: pg_regcomp
regexec.c: pg_regexec
regerror.c: pg_regerror
regfree.c: pg_regfree
+ regprefix.c: pg_regprefix
(The pg_ prefixes were added by the Postgres project to distinguish this
library version from any similar one that might be present on a particular
system. They'd need to be removed or replaced in any standalone version
@@ -44,6 +45,7 @@ regexec.c Top-level regex execution code
rege_dfa.c DFA creation and execution
regerror.c pg_regerror: generate text for a regex error code
regfree.c pg_regfree: API to free a no-longer-needed regex_t
+regprefix.c Code for extracting a common prefix from a regex_t
The locale-specific code is concerned primarily with case-folding and with
expanding locale-specific character classes, such as [[:alnum:]]. It
diff --git a/src/backend/regex/regc_color.c b/src/backend/regex/regc_color.c
index 2aeb861d976..1c60566fbf5 100644
--- a/src/backend/regex/regc_color.c
+++ b/src/backend/regex/regc_color.c
@@ -66,8 +66,9 @@ initcm(struct vars * v,
cd = cm->cd; /* cm->cd[WHITE] */
cd->sub = NOSUB;
cd->arcs = NULL;
- cd->flags = 0;
+ cd->firstchr = CHR_MIN;
cd->nchrs = CHR_MAX - CHR_MIN + 1;
+ cd->flags = 0;
/* upper levels of tree */
for (t = &cm->tree[0], j = NBYTS - 1; j > 0; t = nextt, j--)
@@ -272,6 +273,7 @@ newcolor(struct colormap * cm)
cd->nchrs = 0;
cd->sub = NOSUB;
cd->arcs = NULL;
+ cd->firstchr = CHR_MIN; /* in case never set otherwise */
cd->flags = 0;
cd->block = NULL;
@@ -371,6 +373,8 @@ subcolor(struct colormap * cm, chr c)
if (co == sco) /* already in an open subcolor */
return co; /* rest is redundant */
cm->cd[co].nchrs--;
+ if (cm->cd[sco].nchrs == 0)
+ cm->cd[sco].firstchr = c;
cm->cd[sco].nchrs++;
setcolor(cm, c, sco);
return sco;
@@ -438,6 +442,11 @@ subrange(struct vars * v,
/*
* subblock - allocate new subcolors for one tree block of chrs, fill in arcs
+ *
+ * Note: subcolors that are created during execution of this function
+ * will not be given a useful value of firstchr; it'll be left as CHR_MIN.
+ * For the current usage of firstchr in pg_regprefix, this does not matter
+ * because such subcolors won't occur in the common prefix of a regex.
*/
static void
subblock(struct vars * v,
diff --git a/src/backend/regex/regc_nfa.c b/src/backend/regex/regc_nfa.c
index 66a361ee2ff..085842c92b7 100644
--- a/src/backend/regex/regc_nfa.c
+++ b/src/backend/regex/regc_nfa.c
@@ -1330,14 +1330,16 @@ compact(struct nfa * nfa,
for (s = nfa->states; s != NULL; s = s->next)
{
nstates++;
- narcs += 1 + s->nouts + 1;
- /* 1 as a fake for flags, nouts for arcs, 1 as endmarker */
+ narcs += s->nouts + 1; /* need one extra for endmarker */
}
+ cnfa->stflags = (char *) MALLOC(nstates * sizeof(char));
cnfa->states = (struct carc **) MALLOC(nstates * sizeof(struct carc *));
cnfa->arcs = (struct carc *) MALLOC(narcs * sizeof(struct carc));
- if (cnfa->states == NULL || cnfa->arcs == NULL)
+ if (cnfa->stflags == NULL || cnfa->states == NULL || cnfa->arcs == NULL)
{
+ if (cnfa->stflags != NULL)
+ FREE(cnfa->stflags);
if (cnfa->states != NULL)
FREE(cnfa->states);
if (cnfa->arcs != NULL)
@@ -1359,9 +1361,8 @@ compact(struct nfa * nfa,
for (s = nfa->states; s != NULL; s = s->next)
{
assert((size_t) s->no < nstates);
+ cnfa->stflags[s->no] = 0;
cnfa->states[s->no] = ca;
- ca->co = 0; /* clear and skip flags "arc" */
- ca++;
first = ca;
for (a = s->outs; a != NULL; a = a->outchain)
switch (a->type)
@@ -1392,8 +1393,8 @@ compact(struct nfa * nfa,
/* mark no-progress states */
for (a = nfa->pre->outs; a != NULL; a = a->outchain)
- cnfa->states[a->to->no]->co = 1;
- cnfa->states[nfa->pre->no]->co = 1;
+ cnfa->stflags[a->to->no] = CNFA_NOPROGRESS;
+ cnfa->stflags[nfa->pre->no] = CNFA_NOPROGRESS;
}
/*
@@ -1433,6 +1434,7 @@ freecnfa(struct cnfa * cnfa)
{
assert(cnfa->nstates != 0); /* not empty already */
cnfa->nstates = 0;
+ FREE(cnfa->stflags);
FREE(cnfa->states);
FREE(cnfa->arcs);
}
@@ -1617,7 +1619,7 @@ dumpcnfa(struct cnfa * cnfa,
fprintf(f, ", haslacons");
fprintf(f, "\n");
for (st = 0; st < cnfa->nstates; st++)
- dumpcstate(st, cnfa->states[st], cnfa, f);
+ dumpcstate(st, cnfa, f);
fflush(f);
}
#endif
@@ -1629,22 +1631,20 @@ dumpcnfa(struct cnfa * cnfa,
*/
static void
dumpcstate(int st,
- struct carc * ca,
struct cnfa * cnfa,
FILE *f)
{
- int i;
+ struct carc * ca;
int pos;
- fprintf(f, "%d%s", st, (ca[0].co) ? ":" : ".");
+ fprintf(f, "%d%s", st, (cnfa->stflags[st] & CNFA_NOPROGRESS) ? ":" : ".");
pos = 1;
- for (i = 1; ca[i].co != COLORLESS; i++)
+ for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
{
- if (ca[i].co < cnfa->ncolors)
- fprintf(f, "\t[%ld]->%d", (long) ca[i].co, ca[i].to);
+ if (ca->co < cnfa->ncolors)
+ fprintf(f, "\t[%ld]->%d", (long) ca->co, ca->to);
else
- fprintf(f, "\t:%ld:->%d", (long) ca[i].co - cnfa->ncolors,
- ca[i].to);
+ fprintf(f, "\t:%ld:->%d", (long) (ca->co - cnfa->ncolors), ca->to);
if (pos == 5)
{
fprintf(f, "\n");
@@ -1653,7 +1653,7 @@ dumpcstate(int st,
else
pos++;
}
- if (i == 1 || pos != 1)
+ if (ca == cnfa->states[st] || pos != 1)
fprintf(f, "\n");
fflush(f);
}
diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c
index 57055f04abb..ceb6f0f8737 100644
--- a/src/backend/regex/regcomp.c
+++ b/src/backend/regex/regcomp.c
@@ -162,7 +162,7 @@ static void dumparcs(struct state *, FILE *);
static int dumprarcs(struct arc *, struct state *, FILE *, int);
static void dumparc(struct arc *, struct state *, FILE *);
static void dumpcnfa(struct cnfa *, FILE *);
-static void dumpcstate(int, struct carc *, struct cnfa *, FILE *);
+static void dumpcstate(int, struct cnfa *, FILE *);
#endif
/* === regc_cvec.c === */
static struct cvec *newcvec(int, int);
diff --git a/src/backend/regex/rege_dfa.c b/src/backend/regex/rege_dfa.c
index da7a0bf402f..7a7ba5b89cf 100644
--- a/src/backend/regex/rege_dfa.c
+++ b/src/backend/regex/rege_dfa.c
@@ -457,14 +457,14 @@ miss(struct vars * v, /* used only for debug flags */
gotstate = 0;
for (i = 0; i < d->nstates; i++)
if (ISBSET(css->states, i))
- for (ca = cnfa->states[i] + 1; ca->co != COLORLESS; ca++)
+ for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++)
if (ca->co == co)
{
BSET(d->work, ca->to);
gotstate = 1;
if (ca->to == cnfa->post)
ispost = 1;
- if (!cnfa->states[ca->to]->co)
+ if (!(cnfa->stflags[ca->to] & CNFA_NOPROGRESS))
noprogress = 0;
FDEBUG(("%d -> %d\n", i, ca->to));
}
@@ -475,10 +475,9 @@ miss(struct vars * v, /* used only for debug flags */
dolacons = 0;
for (i = 0; i < d->nstates; i++)
if (ISBSET(d->work, i))
- for (ca = cnfa->states[i] + 1; ca->co != COLORLESS;
- ca++)
+ for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++)
{
- if (ca->co <= cnfa->ncolors)
+ if (ca->co < cnfa->ncolors)
continue; /* NOTE CONTINUE */
sawlacons = 1;
if (ISBSET(d->work, ca->to))
@@ -489,7 +488,7 @@ miss(struct vars * v, /* used only for debug flags */
dolacons = 1;
if (ca->to == cnfa->post)
ispost = 1;
- if (!cnfa->states[ca->to]->co)
+ if (!(cnfa->stflags[ca->to] & CNFA_NOPROGRESS))
noprogress = 0;
FDEBUG(("%d :> %d\n", i, ca->to));
}
diff --git a/src/backend/regex/regprefix.c b/src/backend/regex/regprefix.c
new file mode 100644
index 00000000000..6f91288e9df
--- /dev/null
+++ b/src/backend/regex/regprefix.c
@@ -0,0 +1,259 @@
+/*-------------------------------------------------------------------------
+ *
+ * regprefix.c
+ * Extract a common prefix, if any, from a compiled regex.
+ *
+ *
+ * Portions Copyright (c) 2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1998, 1999 Henry Spencer
+ *
+ * IDENTIFICATION
+ * src/backend/regex/regprefix.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "regex/regguts.h"
+
+
+/*
+ * forward declarations
+ */
+static int findprefix(struct cnfa * cnfa, struct colormap * cm,
+ chr *string, size_t *slength);
+
+
+/*
+ * pg_regprefix - get common prefix for regular expression
+ *
+ * Returns one of:
+ * REG_NOMATCH: there is no common prefix of strings matching the regex
+ * REG_PREFIX: there is a common prefix of strings matching the regex
+ * REG_EXACT: all strings satisfying the regex must match the same string
+ * or a REG_XXX error code
+ *
+ * In the non-failure cases, *string is set to a malloc'd string containing
+ * the common prefix or exact value, of length *slength (measured in chrs
+ * not bytes!).
+ *
+ * This function does not analyze all complex cases (such as lookahead
+ * constraints) exactly. Therefore it is possible that some strings matching
+ * the reported prefix or exact-match string do not satisfy the regex. But
+ * it should never be the case that a string satisfying the regex does not
+ * match the reported prefix or exact-match string.
+ */
+int
+pg_regprefix(regex_t *re,
+ chr **string,
+ size_t *slength)
+{
+ struct guts *g;
+ struct cnfa *cnfa;
+ int st;
+
+ /* sanity checks */
+ if (string == NULL || slength == NULL)
+ return REG_INVARG;
+ *string = NULL; /* initialize for failure cases */
+ *slength = 0;
+ if (re == NULL || re->re_magic != REMAGIC)
+ return REG_INVARG;
+ if (re->re_csize != sizeof(chr))
+ return REG_MIXED;
+
+ /* Initialize locale-dependent support */
+ pg_set_regex_collation(re->re_collation);
+
+ /* setup */
+ g = (struct guts *) re->re_guts;
+ if (g->info & REG_UIMPOSSIBLE)
+ return REG_NOMATCH;
+
+ /*
+ * This implementation considers only the search NFA for the topmost regex
+ * tree node. Therefore, constraints such as backrefs are not fully
+ * applied, which is allowed per the function's API spec.
+ */
+ assert(g->tree != NULL);
+ cnfa = &g->tree->cnfa;
+
+ /*
+ * Since a correct NFA should never contain any exit-free loops, it should
+ * not be possible for our traversal to return to a previously visited
+ * NFA state. Hence we need at most nstates chrs in the output string.
+ */
+ *string = (chr *) MALLOC(cnfa->nstates * sizeof(chr));
+ if (*string == NULL)
+ return REG_ESPACE;
+
+ /* do it */
+ st = findprefix(cnfa, &g->cmap, *string, slength);
+
+ assert(*slength <= cnfa->nstates);
+
+ /* clean up */
+ if (st != REG_PREFIX && st != REG_EXACT)
+ {
+ FREE(*string);
+ *string = NULL;
+ *slength = 0;
+ }
+
+ return st;
+}
+
+/*
+ * findprefix - extract common prefix from cNFA
+ *
+ * Results are returned into the preallocated chr array string[], with
+ * *slength (which must be preset to zero) incremented for each chr.
+ */
+static int /* regprefix return code */
+findprefix(struct cnfa * cnfa,
+ struct colormap * cm,
+ chr *string,
+ size_t *slength)
+{
+ int st;
+ int nextst;
+ color thiscolor;
+ chr c;
+ struct carc *ca;
+
+ /*
+ * The "pre" state must have only BOS/BOL outarcs, else pattern isn't
+ * anchored left. If we have both BOS and BOL, they must go to the
+ * same next state.
+ */
+ st = cnfa->pre;
+ nextst = -1;
+ for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
+ {
+ if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
+ {
+ if (nextst == -1)
+ nextst = ca->to;
+ else if (nextst != ca->to)
+ return REG_NOMATCH;
+ }
+ else
+ return REG_NOMATCH;
+ }
+ if (nextst == -1)
+ return REG_NOMATCH;
+
+ /*
+ * Scan through successive states, stopping as soon as we find one with
+ * more than one acceptable transition character (either multiple colors
+ * on out-arcs, or a color with more than one member chr).
+ *
+ * We could find a state with multiple out-arcs that are all labeled with
+ * the same singleton color; this comes from patterns like "^ab(cde|cxy)".
+ * In that case we add the chr "c" to the output string but then exit the
+ * loop with nextst == -1. This leaves a little bit on the table: if the
+ * pattern is like "^ab(cde|cdy)", we won't notice that "d" could be added
+ * to the prefix. But chasing multiple parallel state chains doesn't seem
+ * worth the trouble.
+ */
+ do
+ {
+ st = nextst;
+ nextst = -1;
+ thiscolor = COLORLESS;
+ for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
+ {
+ /* We ignore lookahead constraints */
+ if (ca->co >= cnfa->ncolors)
+ continue;
+ /* We can also ignore BOS/BOL arcs */
+ if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
+ continue;
+ /* ... but EOS/EOL arcs terminate the search */
+ if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1])
+ {
+ thiscolor = COLORLESS;
+ break;
+ }
+ if (thiscolor == COLORLESS)
+ {
+ /* First plain outarc */
+ thiscolor = ca->co;
+ nextst = ca->to;
+ }
+ else if (thiscolor == ca->co)
+ {
+ /* Another plain outarc for same color */
+ nextst = -1;
+ }
+ else
+ {
+ /* More than one plain outarc color terminates the search */
+ thiscolor = COLORLESS;
+ break;
+ }
+ }
+ /* Done if we didn't find exactly one color on plain outarcs */
+ if (thiscolor == COLORLESS)
+ break;
+ /* The color must be a singleton */
+ if (cm->cd[thiscolor].nchrs != 1)
+ break;
+
+ /*
+ * Identify the color's sole member chr and add it to the prefix
+ * string. In general the colormap data structure doesn't provide a
+ * way to find color member chrs, except by trying GETCOLOR() on each
+ * possible chr value, which won't do at all. However, for the cases
+ * we care about it should be sufficient to test the "firstchr" value,
+ * that is the first chr ever added to the color. There are cases
+ * where this might no longer be a member of the color (so we do need
+ * to test), but none of them are likely to arise for a character that
+ * is a member of a common prefix. If we do hit such a corner case,
+ * we just fall out without adding anything to the prefix string.
+ */
+ c = cm->cd[thiscolor].firstchr;
+ if (GETCOLOR(cm, c) != thiscolor)
+ break;
+
+ string[(*slength)++] = c;
+
+ /* Advance to next state, but only if we have a unique next state */
+ } while (nextst != -1);
+
+ /*
+ * If we ended at a state that only has EOS/EOL outarcs leading to the
+ * "post" state, then we have an exact-match string. Note this is true
+ * even if the string is of zero length.
+ */
+ nextst = -1;
+ for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
+ {
+ if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1])
+ {
+ if (nextst == -1)
+ nextst = ca->to;
+ else if (nextst != ca->to)
+ {
+ nextst = -1;
+ break;
+ }
+ }
+ else
+ {
+ nextst = -1;
+ break;
+ }
+ }
+ if (nextst == cnfa->post)
+ return REG_EXACT;
+
+ /*
+ * Otherwise, if we were unable to identify any prefix characters, say
+ * NOMATCH --- the pattern is anchored left, but doesn't specify any
+ * particular first character.
+ */
+ if (*slength > 0)
+ return REG_PREFIX;
+
+ return REG_NOMATCH;
+}