aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/backend/regex/README8
-rw-r--r--src/backend/regex/re_syntax.n15
-rw-r--r--src/backend/regex/regc_lex.c29
-rw-r--r--src/backend/regex/regc_nfa.c43
-rw-r--r--src/backend/regex/regcomp.c152
-rw-r--r--src/backend/regex/rege_dfa.c162
-rw-r--r--src/backend/regex/regexec.c104
-rw-r--r--src/backend/regex/regexport.c2
-rw-r--r--src/backend/regex/regprefix.c2
-rw-r--r--src/include/regex/regex.h2
-rw-r--r--src/include/regex/regguts.h20
-rw-r--r--src/test/regress/expected/regex.out169
-rw-r--r--src/test/regress/sql/regex.sql35
13 files changed, 673 insertions, 70 deletions
diff --git a/src/backend/regex/README b/src/backend/regex/README
index 5c24d3dfe9d..6c9f48315e3 100644
--- a/src/backend/regex/README
+++ b/src/backend/regex/README
@@ -332,10 +332,10 @@ The possible arc types are:
as "$0->to_state" or "$1->to_state" for end-of-string and end-of-line
constraints respectively.
- LACON constraints, which represent "(?=re)" and "(?!re)" constraints,
- i.e. the input starting at this point must match (or not match) a
- given sub-RE, but the matching input is not consumed. These are
- dumped as ":subtree_number:->to_state".
+ LACON constraints, which represent "(?=re)", "(?!re)", "(?<=re)", and
+ "(?<!re)" constraints, i.e. the input starting/ending at this point must
+ match (or not match) a given sub-RE, but the matching input is not
+ consumed. These are dumped as ":subtree_number:->to_state".
If you see anything else (especially any question marks) in the display of
an arc, it's dumpnfa() trying to tell you that there's something fishy
diff --git a/src/backend/regex/re_syntax.n b/src/backend/regex/re_syntax.n
index f37bb85abdb..4621bfc25f4 100644
--- a/src/backend/regex/re_syntax.n
+++ b/src/backend/regex/re_syntax.n
@@ -196,10 +196,18 @@ where a substring matching \fIre\fR begins
\fB(?!\fIre\fB)\fR
\fInegative lookahead\fR (AREs only), matches at any point
where no substring matching \fIre\fR begins
+.TP
+\fB(?<=\fIre\fB)\fR
+\fIpositive lookbehind\fR (AREs only), matches at any point
+where a substring matching \fIre\fR ends
+.TP
+\fB(?<!\fIre\fB)\fR
+\fInegative lookbehind\fR (AREs only), matches at any point
+where no substring matching \fIre\fR ends
.RE
.PP
-The lookahead constraints may not contain back references (see later),
-and all parentheses within them are considered non-capturing.
+Lookahead and lookbehind constraints may not contain back references
+(see later), and all parentheses within them are considered non-capturing.
.PP
An RE may not end with `\fB\e\fR'.
@@ -856,7 +864,8 @@ Incompatibilities of note include `\fB\eb\fR', `\fB\eB\fR',
the lack of special treatment for a trailing newline,
the addition of complemented bracket expressions to the things
affected by newline-sensitive matching,
-the restrictions on parentheses and back references in lookahead constraints,
+the restrictions on parentheses and back references in lookahead/lookbehind
+constraints,
and the longest/shortest-match (rather than first-match) matching semantics.
.PP
The matching rules for REs containing both normal and non-greedy quantifiers
diff --git a/src/backend/regex/regc_lex.c b/src/backend/regex/regc_lex.c
index f6ed9f09ea4..bfd9dcd2a49 100644
--- a/src/backend/regex/regc_lex.c
+++ b/src/backend/regex/regc_lex.c
@@ -582,6 +582,8 @@ next(struct vars * v)
{
NOTE(REG_UNONPOSIX);
v->now++;
+ if (ATEOS())
+ FAILW(REG_BADRPT);
switch (*v->now++)
{
case CHR(':'): /* non-capturing paren */
@@ -596,12 +598,31 @@ next(struct vars * v)
return next(v);
break;
case CHR('='): /* positive lookahead */
- NOTE(REG_ULOOKAHEAD);
- RETV(LACON, 1);
+ NOTE(REG_ULOOKAROUND);
+ RETV(LACON, LATYPE_AHEAD_POS);
break;
case CHR('!'): /* negative lookahead */
- NOTE(REG_ULOOKAHEAD);
- RETV(LACON, 0);
+ NOTE(REG_ULOOKAROUND);
+ RETV(LACON, LATYPE_AHEAD_NEG);
+ break;
+ case CHR('<'):
+ if (ATEOS())
+ FAILW(REG_BADRPT);
+ switch (*v->now++)
+ {
+ case CHR('='): /* positive lookbehind */
+ NOTE(REG_ULOOKAROUND);
+ RETV(LACON, LATYPE_BEHIND_POS);
+ break;
+ case CHR('!'): /* negative lookbehind */
+ NOTE(REG_ULOOKAROUND);
+ RETV(LACON, LATYPE_BEHIND_NEG);
+ break;
+ default:
+ FAILW(REG_BADRPT);
+ break;
+ }
+ assert(NOTREACHED);
break;
default:
FAILW(REG_BADRPT);
diff --git a/src/backend/regex/regc_nfa.c b/src/backend/regex/regc_nfa.c
index 6f04321cd35..cd9a3239bd3 100644
--- a/src/backend/regex/regc_nfa.c
+++ b/src/backend/regex/regc_nfa.c
@@ -1349,6 +1349,49 @@ cleartraverse(struct nfa * nfa,
}
/*
+ * single_color_transition - does getting from s1 to s2 cross one PLAIN arc?
+ *
+ * If traversing from s1 to s2 requires a single PLAIN match (possibly of any
+ * of a set of colors), return a state whose outarc list contains only PLAIN
+ * arcs of those color(s). Otherwise return NULL.
+ *
+ * This is used before optimizing the NFA, so there may be EMPTY arcs, which
+ * we should ignore; the possibility of an EMPTY is why the result state could
+ * be different from s1.
+ *
+ * It's worth troubling to handle multiple parallel PLAIN arcs here because a
+ * bracket construct such as [abc] might yield either one or several parallel
+ * PLAIN arcs depending on earlier atoms in the expression. We'd rather that
+ * that implementation detail not create user-visible performance differences.
+ */
+static struct state *
+single_color_transition(struct state * s1, struct state * s2)
+{
+ struct arc *a;
+
+ /* Ignore leading EMPTY arc, if any */
+ if (s1->nouts == 1 && s1->outs->type == EMPTY)
+ s1 = s1->outs->to;
+ /* Likewise for any trailing EMPTY arc */
+ if (s2->nins == 1 && s2->ins->type == EMPTY)
+ s2 = s2->ins->from;
+ /* Perhaps we could have a single-state loop in between, if so reject */
+ if (s1 == s2)
+ return NULL;
+ /* s1 must have at least one outarc... */
+ if (s1->outs == NULL)
+ return NULL;
+ /* ... and they must all be PLAIN arcs to s2 */
+ for (a = s1->outs; a != NULL; a = a->outchain)
+ {
+ if (a->type != PLAIN || a->to != s2)
+ return NULL;
+ }
+ /* OK, return s1 as the possessor of the relevant outarcs */
+ return s1;
+}
+
+/*
* specialcolors - fill in special colors for an NFA
*/
static void
diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c
index b733bc7824e..aa759c26486 100644
--- a/src/backend/regex/regcomp.c
+++ b/src/backend/regex/regcomp.c
@@ -57,6 +57,8 @@ static const chr *scanplain(struct vars *);
static void onechr(struct vars *, chr, struct state *, struct state *);
static void dovec(struct vars *, struct cvec *, struct state *, struct state *);
static void wordchrs(struct vars *);
+static void processlacon(struct vars *, struct state *, struct state *, int,
+ struct state *, struct state *);
static struct subre *subre(struct vars *, int, int, struct state *, struct state *);
static void freesubre(struct vars *, struct subre *);
static void freesrnode(struct vars *, struct subre *);
@@ -65,7 +67,7 @@ static int numst(struct subre *, int);
static void markst(struct subre *);
static void cleanst(struct vars *);
static long nfatree(struct vars *, struct subre *, FILE *);
-static long nfanode(struct vars *, struct subre *, FILE *);
+static long nfanode(struct vars *, struct subre *, int, FILE *);
static int newlacon(struct vars *, struct state *, struct state *, int);
static void freelacons(struct subre *, int);
static void rfree(regex_t *);
@@ -146,6 +148,7 @@ static void deltraverse(struct nfa *, struct state *, struct state *);
static void dupnfa(struct nfa *, struct state *, struct state *, struct state *, struct state *);
static void duptraverse(struct nfa *, struct state *, struct state *);
static void cleartraverse(struct nfa *, struct state *);
+static struct state *single_color_transition(struct state *, struct state *);
static void specialcolors(struct nfa *);
static long optimize(struct nfa *, FILE *);
static void pullback(struct nfa *, FILE *);
@@ -245,8 +248,9 @@ struct vars
int ntree; /* number of tree nodes, plus one */
struct cvec *cv; /* interface cvec */
struct cvec *cv2; /* utility cvec */
- struct subre *lacons; /* lookahead-constraint vector */
- int nlacons; /* size of lacons */
+ struct subre *lacons; /* lookaround-constraint vector */
+ int nlacons; /* size of lacons[]; note that only slots
+ * numbered 1 .. nlacons-1 are used */
size_t spaceused; /* approx. space used for compilation */
};
@@ -277,7 +281,7 @@ struct vars
#define CCLASS 'C' /* start of [: */
#define END 'X' /* end of [. [= [: */
#define RANGE 'R' /* - within [] which might be range delim. */
-#define LACON 'L' /* lookahead constraint subRE */
+#define LACON 'L' /* lookaround constraint subRE */
#define AHEAD 'a' /* color-lookahead arc */
#define BEHIND 'r' /* color-lookbehind arc */
#define WBDRY 'w' /* word boundary constraint */
@@ -432,11 +436,15 @@ pg_regcomp(regex_t *re,
assert(v->nlacons == 0 || v->lacons != NULL);
for (i = 1; i < v->nlacons; i++)
{
+ struct subre *lasub = &v->lacons[i];
+
#ifdef REG_DEBUG
if (debug != NULL)
fprintf(debug, "\n\n\n========= LA%d ==========\n", i);
#endif
- nfanode(v, &v->lacons[i], debug);
+
+ /* Prepend .* to pattern if it's a lookbehind LACON */
+ nfanode(v, lasub, !LATYPE_IS_AHEAD(lasub->subno), debug);
}
CNOERR();
if (v->tree->flags & SHORTER)
@@ -640,7 +648,7 @@ makesearch(struct vars * v,
static struct subre *
parse(struct vars * v,
int stopper, /* EOS or ')' */
- int type, /* LACON (lookahead subRE) or PLAIN */
+ int type, /* LACON (lookaround subRE) or PLAIN */
struct state * init, /* initial state */
struct state * final) /* final state */
{
@@ -719,7 +727,7 @@ parse(struct vars * v,
static struct subre *
parsebranch(struct vars * v,
int stopper, /* EOS or ')' */
- int type, /* LACON (lookahead subRE) or PLAIN */
+ int type, /* LACON (lookaround subRE) or PLAIN */
struct state * left, /* leftmost state */
struct state * right, /* rightmost state */
int partial) /* is this only part of a branch? */
@@ -768,7 +776,7 @@ parsebranch(struct vars * v,
static void
parseqatom(struct vars * v,
int stopper, /* EOS or ')' */
- int type, /* LACON (lookahead subRE) or PLAIN */
+ int type, /* LACON (lookaround subRE) or PLAIN */
struct state * lp, /* left state to hang it on */
struct state * rp, /* right state to hang it on */
struct subre * top) /* subtree top */
@@ -782,7 +790,7 @@ parseqatom(struct vars * v,
struct subre *atom; /* atom's subtree */
struct subre *t;
int cap; /* capturing parens? */
- int pos; /* positive lookahead? */
+ int latype; /* lookaround constraint type */
int subno; /* capturing-parens or backref number */
int atomtype;
int qprefer; /* quantifier short/long preference */
@@ -866,19 +874,18 @@ parseqatom(struct vars * v,
nonword(v, AHEAD, s, rp);
return;
break;
- case LACON: /* lookahead constraint */
- pos = v->nextvalue;
+ case LACON: /* lookaround constraint */
+ latype = v->nextvalue;
NEXT();
s = newstate(v->nfa);
s2 = newstate(v->nfa);
NOERR();
t = parse(v, ')', LACON, s, s2);
freesubre(v, t); /* internal structure irrelevant */
- assert(SEE(')') || ISERR());
- NEXT();
- n = newlacon(v, s, s2, pos);
NOERR();
- ARCV(LACON, n);
+ assert(SEE(')'));
+ NEXT();
+ processlacon(v, s, s2, latype, lp, rp);
return;
break;
/* then errors, to get them out of the way */
@@ -1634,6 +1641,75 @@ wordchrs(struct vars * v)
}
/*
+ * processlacon - generate the NFA representation of a LACON
+ *
+ * In the general case this is just newlacon() + newarc(), but some cases
+ * can be optimized.
+ */
+static void
+processlacon(struct vars * v,
+ struct state * begin, /* start of parsed LACON sub-re */
+ struct state * end, /* end of parsed LACON sub-re */
+ int latype,
+ struct state * lp, /* left state to hang it on */
+ struct state * rp) /* right state to hang it on */
+{
+ struct state *s1;
+ int n;
+
+ /*
+ * Check for lookaround RE consisting of a single plain color arc (or set
+ * of arcs); this would typically be a simple chr or a bracket expression.
+ */
+ s1 = single_color_transition(begin, end);
+ switch (latype)
+ {
+ case LATYPE_AHEAD_POS:
+ /* If lookahead RE is just colorset C, convert to AHEAD(C) */
+ if (s1 != NULL)
+ {
+ cloneouts(v->nfa, s1, lp, rp, AHEAD);
+ return;
+ }
+ break;
+ case LATYPE_AHEAD_NEG:
+ /* If lookahead RE is just colorset C, convert to AHEAD(^C)|$ */
+ if (s1 != NULL)
+ {
+ colorcomplement(v->nfa, v->cm, AHEAD, s1, lp, rp);
+ newarc(v->nfa, '$', 1, lp, rp);
+ newarc(v->nfa, '$', 0, lp, rp);
+ return;
+ }
+ break;
+ case LATYPE_BEHIND_POS:
+ /* If lookbehind RE is just colorset C, convert to BEHIND(C) */
+ if (s1 != NULL)
+ {
+ cloneouts(v->nfa, s1, lp, rp, BEHIND);
+ return;
+ }
+ break;
+ case LATYPE_BEHIND_NEG:
+ /* If lookbehind RE is just colorset C, convert to BEHIND(^C)|^ */
+ if (s1 != NULL)
+ {
+ colorcomplement(v->nfa, v->cm, BEHIND, s1, lp, rp);
+ newarc(v->nfa, '^', 1, lp, rp);
+ newarc(v->nfa, '^', 0, lp, rp);
+ return;
+ }
+ break;
+ default:
+ assert(NOTREACHED);
+ }
+
+ /* General case: we need a LACON subre and arc */
+ n = newlacon(v, begin, end, latype);
+ newarc(v->nfa, LACON, n, lp, rp);
+}
+
+/*
* subre - allocate a subre
*/
static struct subre *
@@ -1826,15 +1902,18 @@ nfatree(struct vars * v,
if (t->right != NULL)
(DISCARD) nfatree(v, t->right, f);
- return nfanode(v, t, f);
+ return nfanode(v, t, 0, f);
}
/*
- * nfanode - do one NFA for nfatree
+ * nfanode - do one NFA for nfatree or lacons
+ *
+ * If converttosearch is true, apply makesearch() to the NFA.
*/
static long /* optimize results */
nfanode(struct vars * v,
struct subre * t,
+ int converttosearch,
FILE *f) /* for debug output */
{
struct nfa *nfa;
@@ -1855,10 +1934,11 @@ nfanode(struct vars * v,
NOERRZ();
dupnfa(nfa, t->begin, t->end, nfa->init, nfa->final);
if (!ISERR())
- {
specialcolors(nfa);
+ if (!ISERR())
ret = optimize(nfa, f);
- }
+ if (converttosearch && !ISERR())
+ makesearch(v, nfa);
if (!ISERR())
compact(nfa, &t->cnfa);
@@ -1867,13 +1947,13 @@ nfanode(struct vars * v,
}
/*
- * newlacon - allocate a lookahead-constraint subRE
+ * newlacon - allocate a lookaround-constraint subRE
*/
static int /* lacon number */
newlacon(struct vars * v,
struct state * begin,
struct state * end,
- int pos)
+ int latype)
{
int n;
struct subre *newlacons;
@@ -1900,13 +1980,13 @@ newlacon(struct vars * v,
sub = &v->lacons[n];
sub->begin = begin;
sub->end = end;
- sub->subno = pos;
+ sub->subno = latype;
ZAPCNFA(sub->cnfa);
return n;
}
/*
- * freelacons - free lookahead-constraint subRE vector
+ * freelacons - free lookaround-constraint subRE vector
*/
static void
freelacons(struct subre * subs,
@@ -2020,9 +2100,29 @@ dump(regex_t *re,
}
for (i = 1; i < g->nlacons; i++)
{
- fprintf(f, "\nla%d (%s):\n", i,
- (g->lacons[i].subno) ? "positive" : "negative");
- dumpcnfa(&g->lacons[i].cnfa, f);
+ struct subre *lasub = &g->lacons[i];
+ const char *latype;
+
+ switch (lasub->subno)
+ {
+ case LATYPE_AHEAD_POS:
+ latype = "positive lookahead";
+ break;
+ case LATYPE_AHEAD_NEG:
+ latype = "negative lookahead";
+ break;
+ case LATYPE_BEHIND_POS:
+ latype = "positive lookbehind";
+ break;
+ case LATYPE_BEHIND_NEG:
+ latype = "negative lookbehind";
+ break;
+ default:
+ latype = "???";
+ break;
+ }
+ fprintf(f, "\nla%d (%s):\n", i, latype);
+ dumpcnfa(&lasub->cnfa, f);
}
fprintf(f, "\n");
dumpst(g->tree, f, 0);
diff --git a/src/backend/regex/rege_dfa.c b/src/backend/regex/rege_dfa.c
index a37e4b0ef96..7d90242acef 100644
--- a/src/backend/regex/rege_dfa.c
+++ b/src/backend/regex/rege_dfa.c
@@ -287,6 +287,130 @@ shortest(struct vars * v,
}
/*
+ * matchuntil - incremental matching engine
+ *
+ * This is meant for use with a search-style NFA (that is, the pattern is
+ * known to act as though it had a leading .*). We determine whether a
+ * match exists starting at v->start and ending at probe. Multiple calls
+ * require only O(N) time not O(N^2) so long as the probe values are
+ * nondecreasing. *lastcss and *lastcp must be initialized to NULL before
+ * starting a series of calls.
+ *
+ * Returns 1 if a match exists, 0 if not.
+ * Internal errors also return 0, with v->err set.
+ */
+static int
+matchuntil(struct vars * v,
+ struct dfa * d,
+ chr *probe, /* we want to know if a match ends here */
+ struct sset ** lastcss, /* state storage across calls */
+ chr **lastcp) /* state storage across calls */
+{
+ chr *cp = *lastcp;
+ color co;
+ struct sset *css = *lastcss;
+ struct sset *ss;
+ struct colormap *cm = d->cm;
+
+ /* initialize and startup, or restart, if necessary */
+ if (cp == NULL || cp > probe)
+ {
+ cp = v->start;
+ css = initialize(v, d, cp);
+ if (css == NULL)
+ return 0;
+
+ FDEBUG((">>> startup >>>\n"));
+ co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 0 : 1];
+ FDEBUG(("color %ld\n", (long) co));
+
+ css = miss(v, d, css, co, cp, v->start);
+ if (css == NULL)
+ return 0;
+ css->lastseen = cp;
+ }
+ else if (css == NULL)
+ {
+ /* we previously found that no match is possible beyond *lastcp */
+ return 0;
+ }
+ ss = css;
+
+ /*
+ * This is the main text-scanning loop. It seems worth having two copies
+ * to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG
+ * builds, when you're not actively tracing.
+ */
+#ifdef REG_DEBUG
+ if (v->eflags & REG_FTRACE)
+ {
+ while (cp < probe)
+ {
+ FDEBUG((">>> at c%d >>>\n", (int) (css - d->ssets)));
+ co = GETCOLOR(cm, *cp);
+ FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
+ ss = css->outs[co];
+ if (ss == NULL)
+ {
+ ss = miss(v, d, css, co, cp + 1, v->start);
+ if (ss == NULL)
+ break; /* NOTE BREAK OUT */
+ }
+ cp++;
+ ss->lastseen = cp;
+ css = ss;
+ }
+ }
+ else
+#endif
+ {
+ while (cp < probe)
+ {
+ co = GETCOLOR(cm, *cp);
+ ss = css->outs[co];
+ if (ss == NULL)
+ {
+ ss = miss(v, d, css, co, cp + 1, v->start);
+ if (ss == NULL)
+ break; /* NOTE BREAK OUT */
+ }
+ cp++;
+ ss->lastseen = cp;
+ css = ss;
+ }
+ }
+
+ *lastcss = ss;
+ *lastcp = cp;
+
+ if (ss == NULL)
+ return 0; /* impossible match, or internal error */
+
+ /* We need to process one more chr, or the EOS symbol, to check match */
+ if (cp < v->stop)
+ {
+ FDEBUG((">>> at c%d >>>\n", (int) (css - d->ssets)));
+ co = GETCOLOR(cm, *cp);
+ FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
+ ss = css->outs[co];
+ if (ss == NULL)
+ ss = miss(v, d, css, co, cp + 1, v->start);
+ }
+ else
+ {
+ assert(cp == v->stop);
+ co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 0 : 1];
+ FDEBUG(("color %ld\n", (long) co));
+ ss = miss(v, d, css, co, cp, v->start);
+ }
+
+ if (ss == NULL || !(ss->flags & POSTSTATE))
+ return 0;
+
+ return 1;
+}
+
+/*
* lastcold - determine last point at which no progress had been made
*/
static chr * /* endpoint, or NULL */
@@ -613,19 +737,19 @@ miss(struct vars * v,
}
/*
- * lacon - lookahead-constraint checker for miss()
+ * lacon - lookaround-constraint checker for miss()
*/
static int /* predicate: constraint satisfied? */
lacon(struct vars * v,
struct cnfa * pcnfa, /* parent cnfa */
chr *cp,
- pcolor co) /* "color" of the lookahead constraint */
+ pcolor co) /* "color" of the lookaround constraint */
{
int n;
struct subre *sub;
struct dfa *d;
- struct smalldfa sd;
chr *end;
+ int satisfied;
/* Since this is recursive, it could be driven to stack overflow */
if (STACK_TOO_DEEP(v->re))
@@ -635,19 +759,35 @@ lacon(struct vars * v,
}
n = co - pcnfa->ncolors;
- assert(n < v->g->nlacons && v->g->lacons != NULL);
+ assert(n > 0 && n < v->g->nlacons && v->g->lacons != NULL);
FDEBUG(("=== testing lacon %d\n", n));
sub = &v->g->lacons[n];
- d = newdfa(v, &sub->cnfa, &v->g->cmap, &sd);
+ d = getladfa(v, n);
if (d == NULL)
- {
- ERR(REG_ESPACE);
return 0;
+ if (LATYPE_IS_AHEAD(sub->subno))
+ {
+ /* used to use longest() here, but shortest() could be much cheaper */
+ end = shortest(v, d, cp, cp, v->stop,
+ (chr **) NULL, (int *) NULL);
+ satisfied = LATYPE_IS_POS(sub->subno) ? (end != NULL) : (end == NULL);
+ }
+ else
+ {
+ /*
+ * To avoid doing O(N^2) work when repeatedly testing a lookbehind
+ * constraint in an N-character string, we use matchuntil() which can
+ * cache the DFA state across calls. We only need to restart if the
+ * probe point decreases, which is not common. The NFA we're using is
+ * a search NFA, so it doesn't mind scanning over stuff before the
+ * nominal match.
+ */
+ satisfied = matchuntil(v, d, cp, &v->lblastcss[n], &v->lblastcp[n]);
+ if (!LATYPE_IS_POS(sub->subno))
+ satisfied = !satisfied;
}
- end = longest(v, d, cp, v->stop, (int *) NULL);
- freedfa(d);
- FDEBUG(("=== lacon %d match %d\n", n, (end != NULL)));
- return (sub->subno) ? (end != NULL) : (end == NULL);
+ FDEBUG(("=== lacon %d satisfied %d\n", n, satisfied));
+ return satisfied;
}
/*
diff --git a/src/backend/regex/regexec.c b/src/backend/regex/regexec.c
index 8a21f2cb787..82659a0f2f4 100644
--- a/src/backend/regex/regexec.c
+++ b/src/backend/regex/regexec.c
@@ -112,7 +112,10 @@ struct vars
chr *search_start; /* search start of string */
chr *stop; /* just past end of string */
int err; /* error code if any (0 none) */
- struct dfa **subdfas; /* per-subre DFAs */
+ struct dfa **subdfas; /* per-tree-subre DFAs */
+ struct dfa **ladfas; /* per-lacon-subre DFAs */
+ struct sset **lblastcss; /* per-lacon-subre lookbehind restart data */
+ chr **lblastcp; /* per-lacon-subre lookbehind restart data */
struct smalldfa dfa1;
struct smalldfa dfa2;
};
@@ -132,6 +135,7 @@ struct vars
*/
/* === regexec.c === */
static struct dfa *getsubdfa(struct vars *, struct subre *);
+static struct dfa *getladfa(struct vars *, int);
static int find(struct vars *, struct cnfa *, struct colormap *);
static int cfind(struct vars *, struct cnfa *, struct colormap *);
static int cfindloop(struct vars *, struct cnfa *, struct colormap *, struct dfa *, struct dfa *, chr **);
@@ -149,6 +153,7 @@ static int creviterdissect(struct vars *, struct subre *, chr *, chr *);
/* === rege_dfa.c === */
static chr *longest(struct vars *, struct dfa *, chr *, chr *, int *);
static chr *shortest(struct vars *, struct dfa *, chr *, chr *, chr *, chr **, int *);
+static int matchuntil(struct vars *, struct dfa *, chr *, struct sset **, chr **);
static chr *lastcold(struct vars *, struct dfa *);
static struct dfa *newdfa(struct vars *, struct cnfa *, struct colormap *, struct smalldfa *);
static void freedfa(struct dfa *);
@@ -226,21 +231,54 @@ pg_regexec(regex_t *re,
v->search_start = (chr *) string + search_start;
v->stop = (chr *) string + len;
v->err = 0;
+ v->subdfas = NULL;
+ v->ladfas = NULL;
+ v->lblastcss = NULL;
+ v->lblastcp = NULL;
+ /* below this point, "goto cleanup" will behave sanely */
+
assert(v->g->ntree >= 0);
n = (size_t) v->g->ntree;
if (n <= LOCALDFAS)
v->subdfas = subdfas;
else
- v->subdfas = (struct dfa **) MALLOC(n * sizeof(struct dfa *));
- if (v->subdfas == NULL)
{
- if (v->pmatch != pmatch && v->pmatch != mat)
- FREE(v->pmatch);
- return REG_ESPACE;
+ v->subdfas = (struct dfa **) MALLOC(n * sizeof(struct dfa *));
+ if (v->subdfas == NULL)
+ {
+ st = REG_ESPACE;
+ goto cleanup;
+ }
}
for (i = 0; i < n; i++)
v->subdfas[i] = NULL;
+ assert(v->g->nlacons >= 0);
+ n = (size_t) v->g->nlacons;
+ if (n > 0)
+ {
+ v->ladfas = (struct dfa **) MALLOC(n * sizeof(struct dfa *));
+ if (v->ladfas == NULL)
+ {
+ st = REG_ESPACE;
+ goto cleanup;
+ }
+ for (i = 0; i < n; i++)
+ v->ladfas[i] = NULL;
+ v->lblastcss = (struct sset **) MALLOC(n * sizeof(struct sset *));
+ v->lblastcp = (chr **) MALLOC(n * sizeof(chr *));
+ if (v->lblastcss == NULL || v->lblastcp == NULL)
+ {
+ st = REG_ESPACE;
+ goto cleanup;
+ }
+ for (i = 0; i < n; i++)
+ {
+ v->lblastcss[i] = NULL;
+ v->lblastcp[i] = NULL;
+ }
+ }
+
/* do it */
assert(v->g->tree != NULL);
if (backref)
@@ -257,22 +295,40 @@ pg_regexec(regex_t *re,
}
/* clean up */
+cleanup:
if (v->pmatch != pmatch && v->pmatch != mat)
FREE(v->pmatch);
- n = (size_t) v->g->ntree;
- for (i = 0; i < n; i++)
+ if (v->subdfas != NULL)
+ {
+ n = (size_t) v->g->ntree;
+ for (i = 0; i < n; i++)
+ {
+ if (v->subdfas[i] != NULL)
+ freedfa(v->subdfas[i]);
+ }
+ if (v->subdfas != subdfas)
+ FREE(v->subdfas);
+ }
+ if (v->ladfas != NULL)
{
- if (v->subdfas[i] != NULL)
- freedfa(v->subdfas[i]);
+ n = (size_t) v->g->nlacons;
+ for (i = 0; i < n; i++)
+ {
+ if (v->ladfas[i] != NULL)
+ freedfa(v->ladfas[i]);
+ }
+ FREE(v->ladfas);
}
- if (v->subdfas != subdfas)
- FREE(v->subdfas);
+ if (v->lblastcss != NULL)
+ FREE(v->lblastcss);
+ if (v->lblastcp != NULL)
+ FREE(v->lblastcp);
return st;
}
/*
- * getsubdfa - create or re-fetch the DFA for a subre node
+ * getsubdfa - create or re-fetch the DFA for a tree subre node
*
* We only need to create the DFA once per overall regex execution.
* The DFA will be freed by the cleanup step in pg_regexec().
@@ -291,6 +347,28 @@ getsubdfa(struct vars * v,
}
/*
+ * getladfa - create or re-fetch the DFA for a LACON subre node
+ *
+ * Same as above, but for LACONs.
+ */
+static struct dfa *
+getladfa(struct vars * v,
+ int n)
+{
+ assert(n > 0 && n < v->g->nlacons && v->g->lacons != NULL);
+
+ if (v->ladfas[n] == NULL)
+ {
+ struct subre *sub = &v->g->lacons[n];
+
+ v->ladfas[n] = newdfa(v, &sub->cnfa, &v->g->cmap, DOMALLOC);
+ if (ISERR())
+ return NULL;
+ }
+ return v->ladfas[n];
+}
+
+/*
* find - find a match for the main NFA (no-complications case)
*/
static int
diff --git a/src/backend/regex/regexport.c b/src/backend/regex/regexport.c
index c5524ae4927..91340719e88 100644
--- a/src/backend/regex/regexport.c
+++ b/src/backend/regex/regexport.c
@@ -6,7 +6,7 @@
* In this implementation, the NFA defines a necessary but not sufficient
* condition for a string to match the regex: that is, there can be strings
* that match the NFA but don't match the full regex, but not vice versa.
- * Thus, for example, it is okay for the functions below to ignore lookahead
+ * Thus, for example, it is okay for the functions below to ignore lookaround
* constraints, which merely constrain the string some more.
*
* Notice that these functions return info into caller-provided arrays
diff --git a/src/backend/regex/regprefix.c b/src/backend/regex/regprefix.c
index ce41620a0b4..86928453260 100644
--- a/src/backend/regex/regprefix.c
+++ b/src/backend/regex/regprefix.c
@@ -36,7 +36,7 @@ static int findprefix(struct cnfa * cnfa, struct colormap * cm,
* the common prefix or exact value, of length *slength (measured in chrs
* not bytes!).
*
- * This function does not analyze all complex cases (such as lookahead
+ * This function does not analyze all complex cases (such as lookaround
* constraints) exactly. Therefore it is possible that some strings matching
* the reported prefix or exact-match string do not satisfy the regex. But
* it should never be the case that a string satisfying the regex does not
diff --git a/src/include/regex/regex.h b/src/include/regex/regex.h
index 5e1b692d26c..2f89dc9326b 100644
--- a/src/include/regex/regex.h
+++ b/src/include/regex/regex.h
@@ -58,7 +58,7 @@ typedef struct
size_t re_nsub; /* number of subexpressions */
long re_info; /* information about RE */
#define REG_UBACKREF 000001
-#define REG_ULOOKAHEAD 000002
+#define REG_ULOOKAROUND 000002
#define REG_UBOUNDS 000004
#define REG_UBRACES 000010
#define REG_UBSALNUM 000020
diff --git a/src/include/regex/regguts.h b/src/include/regex/regguts.h
index 19fe991c74f..2ceffa6563b 100644
--- a/src/include/regex/regguts.h
+++ b/src/include/regex/regguts.h
@@ -89,13 +89,19 @@
*/
#define NOTREACHED 0
-#define xxx 1
#define DUPMAX _POSIX2_RE_DUP_MAX
#define DUPINF (DUPMAX+1)
#define REMAGIC 0xfed7 /* magic number for main struct */
+/* Type codes for lookaround constraints */
+#define LATYPE_AHEAD_POS 03 /* positive lookahead */
+#define LATYPE_AHEAD_NEG 02 /* negative lookahead */
+#define LATYPE_BEHIND_POS 01 /* positive lookbehind */
+#define LATYPE_BEHIND_NEG 00 /* negative lookbehind */
+#define LATYPE_IS_POS(la) ((la) & 01)
+#define LATYPE_IS_AHEAD(la) ((la) & 02)
/*
@@ -351,7 +357,7 @@ struct nfa
*
* The non-dummy carc structs are of two types: plain arcs and LACON arcs.
* Plain arcs just store the transition color number as "co". LACON arcs
- * store the lookahead constraint number plus cnfa.ncolors as "co". LACON
+ * store the lookaround constraint number plus cnfa.ncolors as "co". LACON
* arcs can be distinguished from plain by testing for co >= cnfa.ncolors.
*/
struct carc
@@ -365,7 +371,7 @@ struct cnfa
int nstates; /* number of states */
int ncolors; /* number of colors (max color in use + 1) */
int flags;
-#define HASLACONS 01 /* uses lookahead constraints */
+#define HASLACONS 01 /* uses lookaround constraints */
int pre; /* setup state number */
int post; /* teardown state number */
color bos[2]; /* colors, if any, assigned to BOS and BOL */
@@ -433,7 +439,8 @@ struct subre
#define PREF2(f1, f2) ((PREF(f1) != 0) ? PREF(f1) : PREF(f2))
#define COMBINE(f1, f2) (UP((f1)|(f2)) | PREF2(f1, f2))
short id; /* ID of subre (1..ntree-1) */
- int subno; /* subexpression number (for 'b' and '(') */
+ int subno; /* subexpression number for 'b' and '(', or
+ * LATYPE code for lookaround constraint */
short min; /* min repetitions for iteration or backref */
short max; /* max repetitions for iteration or backref */
struct subre *left; /* left child, if any (also freelist chain) */
@@ -479,6 +486,7 @@ struct guts
int ntree; /* number of subre's, plus one */
struct colormap cmap;
int FUNCPTR(compare, (const chr *, const chr *, size_t));
- struct subre *lacons; /* lookahead-constraint vector */
- int nlacons; /* size of lacons */
+ struct subre *lacons; /* lookaround-constraint vector */
+ int nlacons; /* size of lacons[]; note that only slots
+ * numbered 1 .. nlacons-1 are used */
};
diff --git a/src/test/regress/expected/regex.out b/src/test/regress/expected/regex.out
index be151858a38..f0e2fc9eb89 100644
--- a/src/test/regress/expected/regex.out
+++ b/src/test/regress/expected/regex.out
@@ -90,6 +90,175 @@ select substring('a' from '((a)+)');
a
(1 row)
+-- Test lookahead constraints
+select regexp_matches('ab', 'a(?=b)b*');
+ regexp_matches
+----------------
+ {ab}
+(1 row)
+
+select regexp_matches('a', 'a(?=b)b*');
+ regexp_matches
+----------------
+(0 rows)
+
+select regexp_matches('abc', 'a(?=b)b*(?=c)c*');
+ regexp_matches
+----------------
+ {abc}
+(1 row)
+
+select regexp_matches('ab', 'a(?=b)b*(?=c)c*');
+ regexp_matches
+----------------
+(0 rows)
+
+select regexp_matches('ab', 'a(?!b)b*');
+ regexp_matches
+----------------
+(0 rows)
+
+select regexp_matches('a', 'a(?!b)b*');
+ regexp_matches
+----------------
+ {a}
+(1 row)
+
+select regexp_matches('b', '(?=b)b');
+ regexp_matches
+----------------
+ {b}
+(1 row)
+
+select regexp_matches('a', '(?=b)b');
+ regexp_matches
+----------------
+(0 rows)
+
+-- Test lookbehind constraints
+select regexp_matches('abb', '(?<=a)b*');
+ regexp_matches
+----------------
+ {bb}
+(1 row)
+
+select regexp_matches('a', 'a(?<=a)b*');
+ regexp_matches
+----------------
+ {a}
+(1 row)
+
+select regexp_matches('abc', 'a(?<=a)b*(?<=b)c*');
+ regexp_matches
+----------------
+ {abc}
+(1 row)
+
+select regexp_matches('ab', 'a(?<=a)b*(?<=b)c*');
+ regexp_matches
+----------------
+ {ab}
+(1 row)
+
+select regexp_matches('ab', 'a*(?<!a)b*');
+ regexp_matches
+----------------
+ {""}
+(1 row)
+
+select regexp_matches('ab', 'a*(?<!a)b+');
+ regexp_matches
+----------------
+(0 rows)
+
+select regexp_matches('b', 'a*(?<!a)b+');
+ regexp_matches
+----------------
+ {b}
+(1 row)
+
+select regexp_matches('a', 'a(?<!a)b*');
+ regexp_matches
+----------------
+(0 rows)
+
+select regexp_matches('b', '(?<=b)b');
+ regexp_matches
+----------------
+(0 rows)
+
+select regexp_matches('foobar', '(?<=f)b+');
+ regexp_matches
+----------------
+(0 rows)
+
+select regexp_matches('foobar', '(?<=foo)b+');
+ regexp_matches
+----------------
+ {b}
+(1 row)
+
+select regexp_matches('foobar', '(?<=oo)b+');
+ regexp_matches
+----------------
+ {b}
+(1 row)
+
+-- Test optimization of single-chr-or-bracket-expression lookaround constraints
+select 'xz' ~ 'x(?=[xy])';
+ ?column?
+----------
+ f
+(1 row)
+
+select 'xy' ~ 'x(?=[xy])';
+ ?column?
+----------
+ t
+(1 row)
+
+select 'xz' ~ 'x(?![xy])';
+ ?column?
+----------
+ t
+(1 row)
+
+select 'xy' ~ 'x(?![xy])';
+ ?column?
+----------
+ f
+(1 row)
+
+select 'x' ~ 'x(?![xy])';
+ ?column?
+----------
+ t
+(1 row)
+
+select 'xyy' ~ '(?<=[xy])yy+';
+ ?column?
+----------
+ t
+(1 row)
+
+select 'zyy' ~ '(?<=[xy])yy+';
+ ?column?
+----------
+ f
+(1 row)
+
+select 'xyy' ~ '(?<![xy])yy+';
+ ?column?
+----------
+ f
+(1 row)
+
+select 'zyy' ~ '(?<![xy])yy+';
+ ?column?
+----------
+ t
+(1 row)
+
-- Test conversion of regex patterns to indexable conditions
explain (costs off) select * from pg_proc where proname ~ 'abc';
QUERY PLAN
diff --git a/src/test/regress/sql/regex.sql b/src/test/regress/sql/regex.sql
index c59fa35f24d..d3030af295d 100644
--- a/src/test/regress/sql/regex.sql
+++ b/src/test/regress/sql/regex.sql
@@ -25,6 +25,41 @@ select substring('asd TO foo' from ' TO (([a-z0-9._]+|"([^"]+|"")+")+)');
select substring('a' from '((a))+');
select substring('a' from '((a)+)');
+-- Test lookahead constraints
+select regexp_matches('ab', 'a(?=b)b*');
+select regexp_matches('a', 'a(?=b)b*');
+select regexp_matches('abc', 'a(?=b)b*(?=c)c*');
+select regexp_matches('ab', 'a(?=b)b*(?=c)c*');
+select regexp_matches('ab', 'a(?!b)b*');
+select regexp_matches('a', 'a(?!b)b*');
+select regexp_matches('b', '(?=b)b');
+select regexp_matches('a', '(?=b)b');
+
+-- Test lookbehind constraints
+select regexp_matches('abb', '(?<=a)b*');
+select regexp_matches('a', 'a(?<=a)b*');
+select regexp_matches('abc', 'a(?<=a)b*(?<=b)c*');
+select regexp_matches('ab', 'a(?<=a)b*(?<=b)c*');
+select regexp_matches('ab', 'a*(?<!a)b*');
+select regexp_matches('ab', 'a*(?<!a)b+');
+select regexp_matches('b', 'a*(?<!a)b+');
+select regexp_matches('a', 'a(?<!a)b*');
+select regexp_matches('b', '(?<=b)b');
+select regexp_matches('foobar', '(?<=f)b+');
+select regexp_matches('foobar', '(?<=foo)b+');
+select regexp_matches('foobar', '(?<=oo)b+');
+
+-- Test optimization of single-chr-or-bracket-expression lookaround constraints
+select 'xz' ~ 'x(?=[xy])';
+select 'xy' ~ 'x(?=[xy])';
+select 'xz' ~ 'x(?![xy])';
+select 'xy' ~ 'x(?![xy])';
+select 'x' ~ 'x(?![xy])';
+select 'xyy' ~ '(?<=[xy])yy+';
+select 'zyy' ~ '(?<=[xy])yy+';
+select 'xyy' ~ '(?<![xy])yy+';
+select 'zyy' ~ '(?<![xy])yy+';
+
-- Test conversion of regex patterns to indexable conditions
explain (costs off) select * from pg_proc where proname ~ 'abc';
explain (costs off) select * from pg_proc where proname ~ '^abc';