aboutsummaryrefslogtreecommitdiff
path: root/src/backend/regex
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/regex')
-rw-r--r--src/backend/regex/Makefile2
-rw-r--r--src/backend/regex/README4
-rw-r--r--src/backend/regex/regc_color.c11
-rw-r--r--src/backend/regex/regprefix.c259
4 files changed, 273 insertions, 3 deletions
diff --git a/src/backend/regex/Makefile b/src/backend/regex/Makefile
index 21e7fa5329b..74a4c0c89d8 100644
--- a/src/backend/regex/Makefile
+++ b/src/backend/regex/Makefile
@@ -12,7 +12,7 @@ subdir = src/backend/regex
top_builddir = ../../..
include $(top_builddir)/src/Makefile.global
-OBJS = regcomp.o regerror.o regexec.o regfree.o
+OBJS = regcomp.o regerror.o regexec.o regfree.o regprefix.o
include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/regex/README b/src/backend/regex/README
index 89ba6a62ea2..c5d21e8c99d 100644
--- a/src/backend/regex/README
+++ b/src/backend/regex/README
@@ -7,12 +7,13 @@ So this file is an attempt to reverse-engineer some docs.
General source-file layout
--------------------------
-There are four separately-compilable source files, each exposing exactly
+There are five separately-compilable source files, each exposing exactly
one exported function:
regcomp.c: pg_regcomp
regexec.c: pg_regexec
regerror.c: pg_regerror
regfree.c: pg_regfree
+ regprefix.c: pg_regprefix
(The pg_ prefixes were added by the Postgres project to distinguish this
library version from any similar one that might be present on a particular
system. They'd need to be removed or replaced in any standalone version
@@ -44,6 +45,7 @@ regexec.c Top-level regex execution code
rege_dfa.c DFA creation and execution
regerror.c pg_regerror: generate text for a regex error code
regfree.c pg_regfree: API to free a no-longer-needed regex_t
+regprefix.c Code for extracting a common prefix from a regex_t
The locale-specific code is concerned primarily with case-folding and with
expanding locale-specific character classes, such as [[:alnum:]]. It
diff --git a/src/backend/regex/regc_color.c b/src/backend/regex/regc_color.c
index 2aeb861d976..1c60566fbf5 100644
--- a/src/backend/regex/regc_color.c
+++ b/src/backend/regex/regc_color.c
@@ -66,8 +66,9 @@ initcm(struct vars * v,
cd = cm->cd; /* cm->cd[WHITE] */
cd->sub = NOSUB;
cd->arcs = NULL;
- cd->flags = 0;
+ cd->firstchr = CHR_MIN;
cd->nchrs = CHR_MAX - CHR_MIN + 1;
+ cd->flags = 0;
/* upper levels of tree */
for (t = &cm->tree[0], j = NBYTS - 1; j > 0; t = nextt, j--)
@@ -272,6 +273,7 @@ newcolor(struct colormap * cm)
cd->nchrs = 0;
cd->sub = NOSUB;
cd->arcs = NULL;
+ cd->firstchr = CHR_MIN; /* in case never set otherwise */
cd->flags = 0;
cd->block = NULL;
@@ -371,6 +373,8 @@ subcolor(struct colormap * cm, chr c)
if (co == sco) /* already in an open subcolor */
return co; /* rest is redundant */
cm->cd[co].nchrs--;
+ if (cm->cd[sco].nchrs == 0)
+ cm->cd[sco].firstchr = c;
cm->cd[sco].nchrs++;
setcolor(cm, c, sco);
return sco;
@@ -438,6 +442,11 @@ subrange(struct vars * v,
/*
* subblock - allocate new subcolors for one tree block of chrs, fill in arcs
+ *
+ * Note: subcolors that are created during execution of this function
+ * will not be given a useful value of firstchr; it'll be left as CHR_MIN.
+ * For the current usage of firstchr in pg_regprefix, this does not matter
+ * because such subcolors won't occur in the common prefix of a regex.
*/
static void
subblock(struct vars * v,
diff --git a/src/backend/regex/regprefix.c b/src/backend/regex/regprefix.c
new file mode 100644
index 00000000000..6f91288e9df
--- /dev/null
+++ b/src/backend/regex/regprefix.c
@@ -0,0 +1,259 @@
+/*-------------------------------------------------------------------------
+ *
+ * regprefix.c
+ * Extract a common prefix, if any, from a compiled regex.
+ *
+ *
+ * Portions Copyright (c) 2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1998, 1999 Henry Spencer
+ *
+ * IDENTIFICATION
+ * src/backend/regex/regprefix.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "regex/regguts.h"
+
+
+/*
+ * forward declarations
+ */
+static int findprefix(struct cnfa * cnfa, struct colormap * cm,
+ chr *string, size_t *slength);
+
+
+/*
+ * pg_regprefix - get common prefix for regular expression
+ *
+ * Returns one of:
+ * REG_NOMATCH: there is no common prefix of strings matching the regex
+ * REG_PREFIX: there is a common prefix of strings matching the regex
+ * REG_EXACT: all strings satisfying the regex must match the same string
+ * or a REG_XXX error code
+ *
+ * In the non-failure cases, *string is set to a malloc'd string containing
+ * the common prefix or exact value, of length *slength (measured in chrs
+ * not bytes!).
+ *
+ * This function does not analyze all complex cases (such as lookahead
+ * constraints) exactly. Therefore it is possible that some strings matching
+ * the reported prefix or exact-match string do not satisfy the regex. But
+ * it should never be the case that a string satisfying the regex does not
+ * match the reported prefix or exact-match string.
+ */
+int
+pg_regprefix(regex_t *re,
+ chr **string,
+ size_t *slength)
+{
+ struct guts *g;
+ struct cnfa *cnfa;
+ int st;
+
+ /* sanity checks */
+ if (string == NULL || slength == NULL)
+ return REG_INVARG;
+ *string = NULL; /* initialize for failure cases */
+ *slength = 0;
+ if (re == NULL || re->re_magic != REMAGIC)
+ return REG_INVARG;
+ if (re->re_csize != sizeof(chr))
+ return REG_MIXED;
+
+ /* Initialize locale-dependent support */
+ pg_set_regex_collation(re->re_collation);
+
+ /* setup */
+ g = (struct guts *) re->re_guts;
+ if (g->info & REG_UIMPOSSIBLE)
+ return REG_NOMATCH;
+
+ /*
+ * This implementation considers only the search NFA for the topmost regex
+ * tree node. Therefore, constraints such as backrefs are not fully
+ * applied, which is allowed per the function's API spec.
+ */
+ assert(g->tree != NULL);
+ cnfa = &g->tree->cnfa;
+
+ /*
+ * Since a correct NFA should never contain any exit-free loops, it should
+ * not be possible for our traversal to return to a previously visited
+ * NFA state. Hence we need at most nstates chrs in the output string.
+ */
+ *string = (chr *) MALLOC(cnfa->nstates * sizeof(chr));
+ if (*string == NULL)
+ return REG_ESPACE;
+
+ /* do it */
+ st = findprefix(cnfa, &g->cmap, *string, slength);
+
+ assert(*slength <= cnfa->nstates);
+
+ /* clean up */
+ if (st != REG_PREFIX && st != REG_EXACT)
+ {
+ FREE(*string);
+ *string = NULL;
+ *slength = 0;
+ }
+
+ return st;
+}
+
+/*
+ * findprefix - extract common prefix from cNFA
+ *
+ * Results are returned into the preallocated chr array string[], with
+ * *slength (which must be preset to zero) incremented for each chr.
+ */
+static int /* regprefix return code */
+findprefix(struct cnfa * cnfa,
+ struct colormap * cm,
+ chr *string,
+ size_t *slength)
+{
+ int st;
+ int nextst;
+ color thiscolor;
+ chr c;
+ struct carc *ca;
+
+ /*
+ * The "pre" state must have only BOS/BOL outarcs, else pattern isn't
+ * anchored left. If we have both BOS and BOL, they must go to the
+ * same next state.
+ */
+ st = cnfa->pre;
+ nextst = -1;
+ for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
+ {
+ if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
+ {
+ if (nextst == -1)
+ nextst = ca->to;
+ else if (nextst != ca->to)
+ return REG_NOMATCH;
+ }
+ else
+ return REG_NOMATCH;
+ }
+ if (nextst == -1)
+ return REG_NOMATCH;
+
+ /*
+ * Scan through successive states, stopping as soon as we find one with
+ * more than one acceptable transition character (either multiple colors
+ * on out-arcs, or a color with more than one member chr).
+ *
+ * We could find a state with multiple out-arcs that are all labeled with
+ * the same singleton color; this comes from patterns like "^ab(cde|cxy)".
+ * In that case we add the chr "c" to the output string but then exit the
+ * loop with nextst == -1. This leaves a little bit on the table: if the
+ * pattern is like "^ab(cde|cdy)", we won't notice that "d" could be added
+ * to the prefix. But chasing multiple parallel state chains doesn't seem
+ * worth the trouble.
+ */
+ do
+ {
+ st = nextst;
+ nextst = -1;
+ thiscolor = COLORLESS;
+ for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
+ {
+ /* We ignore lookahead constraints */
+ if (ca->co >= cnfa->ncolors)
+ continue;
+ /* We can also ignore BOS/BOL arcs */
+ if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
+ continue;
+ /* ... but EOS/EOL arcs terminate the search */
+ if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1])
+ {
+ thiscolor = COLORLESS;
+ break;
+ }
+ if (thiscolor == COLORLESS)
+ {
+ /* First plain outarc */
+ thiscolor = ca->co;
+ nextst = ca->to;
+ }
+ else if (thiscolor == ca->co)
+ {
+ /* Another plain outarc for same color */
+ nextst = -1;
+ }
+ else
+ {
+ /* More than one plain outarc color terminates the search */
+ thiscolor = COLORLESS;
+ break;
+ }
+ }
+ /* Done if we didn't find exactly one color on plain outarcs */
+ if (thiscolor == COLORLESS)
+ break;
+ /* The color must be a singleton */
+ if (cm->cd[thiscolor].nchrs != 1)
+ break;
+
+ /*
+ * Identify the color's sole member chr and add it to the prefix
+ * string. In general the colormap data structure doesn't provide a
+ * way to find color member chrs, except by trying GETCOLOR() on each
+ * possible chr value, which won't do at all. However, for the cases
+ * we care about it should be sufficient to test the "firstchr" value,
+ * that is the first chr ever added to the color. There are cases
+ * where this might no longer be a member of the color (so we do need
+ * to test), but none of them are likely to arise for a character that
+ * is a member of a common prefix. If we do hit such a corner case,
+ * we just fall out without adding anything to the prefix string.
+ */
+ c = cm->cd[thiscolor].firstchr;
+ if (GETCOLOR(cm, c) != thiscolor)
+ break;
+
+ string[(*slength)++] = c;
+
+ /* Advance to next state, but only if we have a unique next state */
+ } while (nextst != -1);
+
+ /*
+ * If we ended at a state that only has EOS/EOL outarcs leading to the
+ * "post" state, then we have an exact-match string. Note this is true
+ * even if the string is of zero length.
+ */
+ nextst = -1;
+ for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
+ {
+ if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1])
+ {
+ if (nextst == -1)
+ nextst = ca->to;
+ else if (nextst != ca->to)
+ {
+ nextst = -1;
+ break;
+ }
+ }
+ else
+ {
+ nextst = -1;
+ break;
+ }
+ }
+ if (nextst == cnfa->post)
+ return REG_EXACT;
+
+ /*
+ * Otherwise, if we were unable to identify any prefix characters, say
+ * NOMATCH --- the pattern is anchored left, but doesn't specify any
+ * particular first character.
+ */
+ if (*slength > 0)
+ return REG_PREFIX;
+
+ return REG_NOMATCH;
+}