diff options
author | Tom Lane <tgl@sss.pgh.pa.us> | 2012-07-10 18:00:39 -0400 |
---|---|---|
committer | Tom Lane <tgl@sss.pgh.pa.us> | 2012-07-10 18:00:39 -0400 |
commit | 18c8dc32ce6759a8837f01e3dd0036cd19ee8683 (patch) | |
tree | d2d058f0f362cf22af5856a4e20cc7f814835f87 /src/backend/regex | |
parent | f12960d8c9ce454a37c2082549b7766ce36627d2 (diff) | |
download | postgresql-18c8dc32ce6759a8837f01e3dd0036cd19ee8683.tar.gz postgresql-18c8dc32ce6759a8837f01e3dd0036cd19ee8683.zip |
Back-patch fix for extraction of fixed prefixes from regular expressions.
Back-patch of commits 628cbb50ba80c83917b07a7609ddec12cda172d0 and
c6aae3042be5249e672b731ebeb21875b5343010. This has been broken since
7.3, so back-patch to all supported branches.
Diffstat (limited to 'src/backend/regex')
-rw-r--r-- | src/backend/regex/Makefile | 2 | ||||
-rw-r--r-- | src/backend/regex/README | 4 | ||||
-rw-r--r-- | src/backend/regex/regc_color.c | 11 | ||||
-rw-r--r-- | src/backend/regex/regc_nfa.c | 34 | ||||
-rw-r--r-- | src/backend/regex/regcomp.c | 2 | ||||
-rw-r--r-- | src/backend/regex/rege_dfa.c | 11 | ||||
-rw-r--r-- | src/backend/regex/regprefix.c | 259 |
7 files changed, 296 insertions, 27 deletions
diff --git a/src/backend/regex/Makefile b/src/backend/regex/Makefile index 21e7fa5329b..74a4c0c89d8 100644 --- a/src/backend/regex/Makefile +++ b/src/backend/regex/Makefile @@ -12,7 +12,7 @@ subdir = src/backend/regex top_builddir = ../../.. include $(top_builddir)/src/Makefile.global -OBJS = regcomp.o regerror.o regexec.o regfree.o +OBJS = regcomp.o regerror.o regexec.o regfree.o regprefix.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/regex/README b/src/backend/regex/README index 89ba6a62ea2..c5d21e8c99d 100644 --- a/src/backend/regex/README +++ b/src/backend/regex/README @@ -7,12 +7,13 @@ So this file is an attempt to reverse-engineer some docs. General source-file layout -------------------------- -There are four separately-compilable source files, each exposing exactly +There are five separately-compilable source files, each exposing exactly one exported function: regcomp.c: pg_regcomp regexec.c: pg_regexec regerror.c: pg_regerror regfree.c: pg_regfree + regprefix.c: pg_regprefix (The pg_ prefixes were added by the Postgres project to distinguish this library version from any similar one that might be present on a particular system. They'd need to be removed or replaced in any standalone version @@ -44,6 +45,7 @@ regexec.c Top-level regex execution code rege_dfa.c DFA creation and execution regerror.c pg_regerror: generate text for a regex error code regfree.c pg_regfree: API to free a no-longer-needed regex_t +regprefix.c Code for extracting a common prefix from a regex_t The locale-specific code is concerned primarily with case-folding and with expanding locale-specific character classes, such as [[:alnum:]]. It diff --git a/src/backend/regex/regc_color.c b/src/backend/regex/regc_color.c index 2aeb861d976..1c60566fbf5 100644 --- a/src/backend/regex/regc_color.c +++ b/src/backend/regex/regc_color.c @@ -66,8 +66,9 @@ initcm(struct vars * v, cd = cm->cd; /* cm->cd[WHITE] */ cd->sub = NOSUB; cd->arcs = NULL; - cd->flags = 0; + cd->firstchr = CHR_MIN; cd->nchrs = CHR_MAX - CHR_MIN + 1; + cd->flags = 0; /* upper levels of tree */ for (t = &cm->tree[0], j = NBYTS - 1; j > 0; t = nextt, j--) @@ -272,6 +273,7 @@ newcolor(struct colormap * cm) cd->nchrs = 0; cd->sub = NOSUB; cd->arcs = NULL; + cd->firstchr = CHR_MIN; /* in case never set otherwise */ cd->flags = 0; cd->block = NULL; @@ -371,6 +373,8 @@ subcolor(struct colormap * cm, chr c) if (co == sco) /* already in an open subcolor */ return co; /* rest is redundant */ cm->cd[co].nchrs--; + if (cm->cd[sco].nchrs == 0) + cm->cd[sco].firstchr = c; cm->cd[sco].nchrs++; setcolor(cm, c, sco); return sco; @@ -438,6 +442,11 @@ subrange(struct vars * v, /* * subblock - allocate new subcolors for one tree block of chrs, fill in arcs + * + * Note: subcolors that are created during execution of this function + * will not be given a useful value of firstchr; it'll be left as CHR_MIN. + * For the current usage of firstchr in pg_regprefix, this does not matter + * because such subcolors won't occur in the common prefix of a regex. */ static void subblock(struct vars * v, diff --git a/src/backend/regex/regc_nfa.c b/src/backend/regex/regc_nfa.c index 66a361ee2ff..085842c92b7 100644 --- a/src/backend/regex/regc_nfa.c +++ b/src/backend/regex/regc_nfa.c @@ -1330,14 +1330,16 @@ compact(struct nfa * nfa, for (s = nfa->states; s != NULL; s = s->next) { nstates++; - narcs += 1 + s->nouts + 1; - /* 1 as a fake for flags, nouts for arcs, 1 as endmarker */ + narcs += s->nouts + 1; /* need one extra for endmarker */ } + cnfa->stflags = (char *) MALLOC(nstates * sizeof(char)); cnfa->states = (struct carc **) MALLOC(nstates * sizeof(struct carc *)); cnfa->arcs = (struct carc *) MALLOC(narcs * sizeof(struct carc)); - if (cnfa->states == NULL || cnfa->arcs == NULL) + if (cnfa->stflags == NULL || cnfa->states == NULL || cnfa->arcs == NULL) { + if (cnfa->stflags != NULL) + FREE(cnfa->stflags); if (cnfa->states != NULL) FREE(cnfa->states); if (cnfa->arcs != NULL) @@ -1359,9 +1361,8 @@ compact(struct nfa * nfa, for (s = nfa->states; s != NULL; s = s->next) { assert((size_t) s->no < nstates); + cnfa->stflags[s->no] = 0; cnfa->states[s->no] = ca; - ca->co = 0; /* clear and skip flags "arc" */ - ca++; first = ca; for (a = s->outs; a != NULL; a = a->outchain) switch (a->type) @@ -1392,8 +1393,8 @@ compact(struct nfa * nfa, /* mark no-progress states */ for (a = nfa->pre->outs; a != NULL; a = a->outchain) - cnfa->states[a->to->no]->co = 1; - cnfa->states[nfa->pre->no]->co = 1; + cnfa->stflags[a->to->no] = CNFA_NOPROGRESS; + cnfa->stflags[nfa->pre->no] = CNFA_NOPROGRESS; } /* @@ -1433,6 +1434,7 @@ freecnfa(struct cnfa * cnfa) { assert(cnfa->nstates != 0); /* not empty already */ cnfa->nstates = 0; + FREE(cnfa->stflags); FREE(cnfa->states); FREE(cnfa->arcs); } @@ -1617,7 +1619,7 @@ dumpcnfa(struct cnfa * cnfa, fprintf(f, ", haslacons"); fprintf(f, "\n"); for (st = 0; st < cnfa->nstates; st++) - dumpcstate(st, cnfa->states[st], cnfa, f); + dumpcstate(st, cnfa, f); fflush(f); } #endif @@ -1629,22 +1631,20 @@ dumpcnfa(struct cnfa * cnfa, */ static void dumpcstate(int st, - struct carc * ca, struct cnfa * cnfa, FILE *f) { - int i; + struct carc * ca; int pos; - fprintf(f, "%d%s", st, (ca[0].co) ? ":" : "."); + fprintf(f, "%d%s", st, (cnfa->stflags[st] & CNFA_NOPROGRESS) ? ":" : "."); pos = 1; - for (i = 1; ca[i].co != COLORLESS; i++) + for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++) { - if (ca[i].co < cnfa->ncolors) - fprintf(f, "\t[%ld]->%d", (long) ca[i].co, ca[i].to); + if (ca->co < cnfa->ncolors) + fprintf(f, "\t[%ld]->%d", (long) ca->co, ca->to); else - fprintf(f, "\t:%ld:->%d", (long) ca[i].co - cnfa->ncolors, - ca[i].to); + fprintf(f, "\t:%ld:->%d", (long) (ca->co - cnfa->ncolors), ca->to); if (pos == 5) { fprintf(f, "\n"); @@ -1653,7 +1653,7 @@ dumpcstate(int st, else pos++; } - if (i == 1 || pos != 1) + if (ca == cnfa->states[st] || pos != 1) fprintf(f, "\n"); fflush(f); } diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c index 57055f04abb..ceb6f0f8737 100644 --- a/src/backend/regex/regcomp.c +++ b/src/backend/regex/regcomp.c @@ -162,7 +162,7 @@ static void dumparcs(struct state *, FILE *); static int dumprarcs(struct arc *, struct state *, FILE *, int); static void dumparc(struct arc *, struct state *, FILE *); static void dumpcnfa(struct cnfa *, FILE *); -static void dumpcstate(int, struct carc *, struct cnfa *, FILE *); +static void dumpcstate(int, struct cnfa *, FILE *); #endif /* === regc_cvec.c === */ static struct cvec *newcvec(int, int); diff --git a/src/backend/regex/rege_dfa.c b/src/backend/regex/rege_dfa.c index da7a0bf402f..7a7ba5b89cf 100644 --- a/src/backend/regex/rege_dfa.c +++ b/src/backend/regex/rege_dfa.c @@ -457,14 +457,14 @@ miss(struct vars * v, /* used only for debug flags */ gotstate = 0; for (i = 0; i < d->nstates; i++) if (ISBSET(css->states, i)) - for (ca = cnfa->states[i] + 1; ca->co != COLORLESS; ca++) + for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++) if (ca->co == co) { BSET(d->work, ca->to); gotstate = 1; if (ca->to == cnfa->post) ispost = 1; - if (!cnfa->states[ca->to]->co) + if (!(cnfa->stflags[ca->to] & CNFA_NOPROGRESS)) noprogress = 0; FDEBUG(("%d -> %d\n", i, ca->to)); } @@ -475,10 +475,9 @@ miss(struct vars * v, /* used only for debug flags */ dolacons = 0; for (i = 0; i < d->nstates; i++) if (ISBSET(d->work, i)) - for (ca = cnfa->states[i] + 1; ca->co != COLORLESS; - ca++) + for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++) { - if (ca->co <= cnfa->ncolors) + if (ca->co < cnfa->ncolors) continue; /* NOTE CONTINUE */ sawlacons = 1; if (ISBSET(d->work, ca->to)) @@ -489,7 +488,7 @@ miss(struct vars * v, /* used only for debug flags */ dolacons = 1; if (ca->to == cnfa->post) ispost = 1; - if (!cnfa->states[ca->to]->co) + if (!(cnfa->stflags[ca->to] & CNFA_NOPROGRESS)) noprogress = 0; FDEBUG(("%d :> %d\n", i, ca->to)); } diff --git a/src/backend/regex/regprefix.c b/src/backend/regex/regprefix.c new file mode 100644 index 00000000000..6f91288e9df --- /dev/null +++ b/src/backend/regex/regprefix.c @@ -0,0 +1,259 @@ +/*------------------------------------------------------------------------- + * + * regprefix.c + * Extract a common prefix, if any, from a compiled regex. + * + * + * Portions Copyright (c) 2012, PostgreSQL Global Development Group + * Portions Copyright (c) 1998, 1999 Henry Spencer + * + * IDENTIFICATION + * src/backend/regex/regprefix.c + * + *------------------------------------------------------------------------- + */ + +#include "regex/regguts.h" + + +/* + * forward declarations + */ +static int findprefix(struct cnfa * cnfa, struct colormap * cm, + chr *string, size_t *slength); + + +/* + * pg_regprefix - get common prefix for regular expression + * + * Returns one of: + * REG_NOMATCH: there is no common prefix of strings matching the regex + * REG_PREFIX: there is a common prefix of strings matching the regex + * REG_EXACT: all strings satisfying the regex must match the same string + * or a REG_XXX error code + * + * In the non-failure cases, *string is set to a malloc'd string containing + * the common prefix or exact value, of length *slength (measured in chrs + * not bytes!). + * + * This function does not analyze all complex cases (such as lookahead + * constraints) exactly. Therefore it is possible that some strings matching + * the reported prefix or exact-match string do not satisfy the regex. But + * it should never be the case that a string satisfying the regex does not + * match the reported prefix or exact-match string. + */ +int +pg_regprefix(regex_t *re, + chr **string, + size_t *slength) +{ + struct guts *g; + struct cnfa *cnfa; + int st; + + /* sanity checks */ + if (string == NULL || slength == NULL) + return REG_INVARG; + *string = NULL; /* initialize for failure cases */ + *slength = 0; + if (re == NULL || re->re_magic != REMAGIC) + return REG_INVARG; + if (re->re_csize != sizeof(chr)) + return REG_MIXED; + + /* Initialize locale-dependent support */ + pg_set_regex_collation(re->re_collation); + + /* setup */ + g = (struct guts *) re->re_guts; + if (g->info & REG_UIMPOSSIBLE) + return REG_NOMATCH; + + /* + * This implementation considers only the search NFA for the topmost regex + * tree node. Therefore, constraints such as backrefs are not fully + * applied, which is allowed per the function's API spec. + */ + assert(g->tree != NULL); + cnfa = &g->tree->cnfa; + + /* + * Since a correct NFA should never contain any exit-free loops, it should + * not be possible for our traversal to return to a previously visited + * NFA state. Hence we need at most nstates chrs in the output string. + */ + *string = (chr *) MALLOC(cnfa->nstates * sizeof(chr)); + if (*string == NULL) + return REG_ESPACE; + + /* do it */ + st = findprefix(cnfa, &g->cmap, *string, slength); + + assert(*slength <= cnfa->nstates); + + /* clean up */ + if (st != REG_PREFIX && st != REG_EXACT) + { + FREE(*string); + *string = NULL; + *slength = 0; + } + + return st; +} + +/* + * findprefix - extract common prefix from cNFA + * + * Results are returned into the preallocated chr array string[], with + * *slength (which must be preset to zero) incremented for each chr. + */ +static int /* regprefix return code */ +findprefix(struct cnfa * cnfa, + struct colormap * cm, + chr *string, + size_t *slength) +{ + int st; + int nextst; + color thiscolor; + chr c; + struct carc *ca; + + /* + * The "pre" state must have only BOS/BOL outarcs, else pattern isn't + * anchored left. If we have both BOS and BOL, they must go to the + * same next state. + */ + st = cnfa->pre; + nextst = -1; + for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++) + { + if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1]) + { + if (nextst == -1) + nextst = ca->to; + else if (nextst != ca->to) + return REG_NOMATCH; + } + else + return REG_NOMATCH; + } + if (nextst == -1) + return REG_NOMATCH; + + /* + * Scan through successive states, stopping as soon as we find one with + * more than one acceptable transition character (either multiple colors + * on out-arcs, or a color with more than one member chr). + * + * We could find a state with multiple out-arcs that are all labeled with + * the same singleton color; this comes from patterns like "^ab(cde|cxy)". + * In that case we add the chr "c" to the output string but then exit the + * loop with nextst == -1. This leaves a little bit on the table: if the + * pattern is like "^ab(cde|cdy)", we won't notice that "d" could be added + * to the prefix. But chasing multiple parallel state chains doesn't seem + * worth the trouble. + */ + do + { + st = nextst; + nextst = -1; + thiscolor = COLORLESS; + for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++) + { + /* We ignore lookahead constraints */ + if (ca->co >= cnfa->ncolors) + continue; + /* We can also ignore BOS/BOL arcs */ + if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1]) + continue; + /* ... but EOS/EOL arcs terminate the search */ + if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1]) + { + thiscolor = COLORLESS; + break; + } + if (thiscolor == COLORLESS) + { + /* First plain outarc */ + thiscolor = ca->co; + nextst = ca->to; + } + else if (thiscolor == ca->co) + { + /* Another plain outarc for same color */ + nextst = -1; + } + else + { + /* More than one plain outarc color terminates the search */ + thiscolor = COLORLESS; + break; + } + } + /* Done if we didn't find exactly one color on plain outarcs */ + if (thiscolor == COLORLESS) + break; + /* The color must be a singleton */ + if (cm->cd[thiscolor].nchrs != 1) + break; + + /* + * Identify the color's sole member chr and add it to the prefix + * string. In general the colormap data structure doesn't provide a + * way to find color member chrs, except by trying GETCOLOR() on each + * possible chr value, which won't do at all. However, for the cases + * we care about it should be sufficient to test the "firstchr" value, + * that is the first chr ever added to the color. There are cases + * where this might no longer be a member of the color (so we do need + * to test), but none of them are likely to arise for a character that + * is a member of a common prefix. If we do hit such a corner case, + * we just fall out without adding anything to the prefix string. + */ + c = cm->cd[thiscolor].firstchr; + if (GETCOLOR(cm, c) != thiscolor) + break; + + string[(*slength)++] = c; + + /* Advance to next state, but only if we have a unique next state */ + } while (nextst != -1); + + /* + * If we ended at a state that only has EOS/EOL outarcs leading to the + * "post" state, then we have an exact-match string. Note this is true + * even if the string is of zero length. + */ + nextst = -1; + for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++) + { + if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1]) + { + if (nextst == -1) + nextst = ca->to; + else if (nextst != ca->to) + { + nextst = -1; + break; + } + } + else + { + nextst = -1; + break; + } + } + if (nextst == cnfa->post) + return REG_EXACT; + + /* + * Otherwise, if we were unable to identify any prefix characters, say + * NOMATCH --- the pattern is anchored left, but doesn't specify any + * particular first character. + */ + if (*slength > 0) + return REG_PREFIX; + + return REG_NOMATCH; +} |