diff options
author | Tom Lane <tgl@sss.pgh.pa.us> | 2012-02-24 01:40:18 -0500 |
---|---|---|
committer | Tom Lane <tgl@sss.pgh.pa.us> | 2012-02-24 01:41:03 -0500 |
commit | 173e29aa5deefd9e71c183583ba37805c8102a72 (patch) | |
tree | f7e997faabaa7aa3e19e00ee561096404817d092 /src/backend/regex/regexec.c | |
parent | 0c9e5d5e0d407013bf66af01942a7b2dd3342546 (diff) | |
download | postgresql-173e29aa5deefd9e71c183583ba37805c8102a72.tar.gz postgresql-173e29aa5deefd9e71c183583ba37805c8102a72.zip |
Fix the general case of quantified regex back-references.
Cases where a back-reference is part of a larger subexpression that
is quantified have never worked in Spencer's regex engine, because
he used a compile-time transformation that neglected the need to
check the back-reference match in iterations before the last one.
(That was okay for capturing parens, and we still do it if the
regex has *only* capturing parens ... but it's not okay for backrefs.)
To make this work properly, we have to add an "iteration" node type
to the regex engine's vocabulary of sub-regex nodes. Since this is a
moderately large change with a fair risk of introducing new bugs of its
own, apply to HEAD only, even though it's a fix for a longstanding bug.
Diffstat (limited to 'src/backend/regex/regexec.c')
-rw-r--r-- | src/backend/regex/regexec.c | 764 |
1 files changed, 757 insertions, 7 deletions
diff --git a/src/backend/regex/regexec.c b/src/backend/regex/regexec.c index 224da5064b6..ea16e39a6ed 100644 --- a/src/backend/regex/regexec.c +++ b/src/backend/regex/regexec.c @@ -140,11 +140,15 @@ static void subset(struct vars *, struct subre *, chr *, chr *); static int dissect(struct vars *, struct subre *, chr *, chr *); static int condissect(struct vars *, struct subre *, chr *, chr *); static int altdissect(struct vars *, struct subre *, chr *, chr *); +static int iterdissect(struct vars *, struct subre *, chr *, chr *); +static int reviterdissect(struct vars *, struct subre *, chr *, chr *); static int cdissect(struct vars *, struct subre *, chr *, chr *); static int ccondissect(struct vars *, struct subre *, chr *, chr *); static int crevdissect(struct vars *, struct subre *, chr *, chr *); static int cbrdissect(struct vars *, struct subre *, chr *, chr *); static int caltdissect(struct vars *, struct subre *, chr *, chr *); +static int citerdissect(struct vars *, struct subre *, chr *, chr *); +static int creviterdissect(struct vars *, struct subre *, chr *, chr *); /* === rege_dfa.c === */ static chr *longest(struct vars *, struct dfa *, chr *, chr *, int *); @@ -563,14 +567,17 @@ dissect(struct vars * v, case '=': /* terminal node */ assert(t->left == NULL && t->right == NULL); return REG_OKAY; /* no action, parent did the work */ - case '|': /* alternation */ - assert(t->left != NULL); - return altdissect(v, t, begin, end); case 'b': /* back ref -- shouldn't be calling us! */ return REG_ASSERT; case '.': /* concatenation */ assert(t->left != NULL && t->right != NULL); return condissect(v, t, begin, end); + case '|': /* alternation */ + assert(t->left != NULL); + return altdissect(v, t, begin, end); + case '*': /* iteration */ + assert(t->left != NULL); + return iterdissect(v, t, begin, end); case '(': /* capturing */ assert(t->left != NULL && t->right == NULL); assert(t->subno > 0); @@ -697,6 +704,375 @@ altdissect(struct vars * v, } /* + * iterdissect - iteration subexpression matches (uncomplicated) + */ +static int /* regexec return code */ +iterdissect(struct vars * v, + struct subre * t, + chr *begin, /* beginning of relevant substring */ + chr *end) /* end of same */ +{ + struct dfa *d; + chr **endpts; + chr *limit; + int min_matches; + size_t max_matches; + int nverified; + int k; + int i; + int er; + + assert(t->op == '*'); + assert(t->left != NULL && t->left->cnfa.nstates > 0); + assert(begin <= end); + + if (t->left->flags & SHORTER) /* reverse scan */ + return reviterdissect(v, t, begin, end); + + /* + * If zero matches are allowed, and target string is empty, just declare + * victory. OTOH, if target string isn't empty, zero matches can't work + * so we pretend the min is 1. + */ + min_matches = t->min; + if (min_matches <= 0) + { + if (begin == end) + return REG_OKAY; + min_matches = 1; + } + + /* + * We need workspace to track the endpoints of each sub-match. Normally + * we consider only nonzero-length sub-matches, so there can be at most + * end-begin of them. However, if min is larger than that, we will also + * consider zero-length sub-matches in order to find enough matches. + * + * For convenience, endpts[0] contains the "begin" pointer and we store + * sub-match endpoints in endpts[1..max_matches]. + */ + max_matches = end - begin; + if (max_matches > t->max && t->max != INFINITY) + max_matches = t->max; + if (max_matches < min_matches) + max_matches = min_matches; + endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *)); + if (endpts == NULL) + return REG_ESPACE; + endpts[0] = begin; + + d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC); + if (ISERR()) + { + FREE(endpts); + return v->err; + } + MDEBUG(("iter %d\n", t->retry)); + + /* + * Our strategy is to first find a set of sub-match endpoints that are + * valid according to the child node's DFA, and then recursively dissect + * each sub-match to confirm validity. If any validity check fails, + * backtrack the last sub-match and try again. And, when we next try for + * a validity check, we need not recheck any successfully verified + * sub-matches that we didn't move the endpoints of. nverified remembers + * how many sub-matches are currently known okay. + */ + + /* initialize to consider first sub-match */ + nverified = 0; + k = 1; + limit = end; + + /* iterate until satisfaction or failure */ + while (k > 0) + { + /* try to find an endpoint for the k'th sub-match */ + endpts[k] = longest(v, d, endpts[k - 1], limit, (int *) NULL); + if (endpts[k] == NULL) + { + /* no match possible, so see if we can shorten previous one */ + k--; + goto backtrack; + } + MDEBUG(("%d: working endpoint %d: %ld\n", + t->retry, k, LOFF(endpts[k]))); + + /* k'th sub-match can no longer be considered verified */ + if (nverified >= k) + nverified = k - 1; + + if (endpts[k] != end) + { + /* haven't reached end yet, try another iteration if allowed */ + if (k >= max_matches) + { + /* must try to shorten some previous match */ + k--; + goto backtrack; + } + + /* reject zero-length match unless necessary to achieve min */ + if (endpts[k] == endpts[k - 1] && + (k >= min_matches || min_matches - k < end - endpts[k])) + goto backtrack; + + k++; + limit = end; + continue; + } + + /* + * We've identified a way to divide the string into k sub-matches + * that works so far as the child DFA can tell. If k is an allowed + * number of matches, start the slow part: recurse to verify each + * sub-match. We always have k <= max_matches, needn't check that. + */ + if (k < min_matches) + goto backtrack; + + MDEBUG(("%d: verifying %d..%d\n", t->retry, nverified + 1, k)); + + for (i = nverified + 1; i <= k; i++) + { + er = dissect(v, t->left, endpts[i - 1], endpts[i]); + if (er == REG_OKAY) + { + nverified = i; + continue; + } + if (er == REG_NOMATCH) + break; + /* oops, something failed */ + freedfa(d); + FREE(endpts); + return er; + } + + if (i > k) + { + /* satisfaction */ + MDEBUG(("%d successful\n", t->retry)); + freedfa(d); + FREE(endpts); + return REG_OKAY; + } + + /* match failed to verify, so backtrack */ + +backtrack: + /* + * Must consider shorter versions of the current sub-match. However, + * we'll only ask for a zero-length match if necessary. + */ + while (k > 0) + { + chr *prev_end = endpts[k - 1]; + + if (endpts[k] > prev_end) + { + limit = endpts[k] - 1; + if (limit > prev_end || + (k < min_matches && min_matches - k >= end - prev_end)) + { + /* break out of backtrack loop, continue the outer one */ + break; + } + } + /* can't shorten k'th sub-match any more, consider previous one */ + k--; + } + } + + /* all possibilities exhausted - shouldn't happen in uncomplicated mode */ + MDEBUG(("%d failed\n", t->retry)); + freedfa(d); + FREE(endpts); + return REG_ASSERT; +} + +/* + * reviterdissect - shortest-first iteration subexpression matches + */ +static int /* regexec return code */ +reviterdissect(struct vars * v, + struct subre * t, + chr *begin, /* beginning of relevant substring */ + chr *end) /* end of same */ +{ + struct dfa *d; + chr **endpts; + chr *limit; + int min_matches; + size_t max_matches; + int nverified; + int k; + int i; + int er; + + assert(t->op == '*'); + assert(t->left != NULL && t->left->cnfa.nstates > 0); + assert(t->left->flags & SHORTER); + assert(begin <= end); + + /* + * If zero matches are allowed, and target string is empty, just declare + * victory. OTOH, if target string isn't empty, zero matches can't work + * so we pretend the min is 1. + */ + min_matches = t->min; + if (min_matches <= 0) + { + if (begin == end) + return REG_OKAY; + min_matches = 1; + } + + /* + * We need workspace to track the endpoints of each sub-match. Normally + * we consider only nonzero-length sub-matches, so there can be at most + * end-begin of them. However, if min is larger than that, we will also + * consider zero-length sub-matches in order to find enough matches. + * + * For convenience, endpts[0] contains the "begin" pointer and we store + * sub-match endpoints in endpts[1..max_matches]. + */ + max_matches = end - begin; + if (max_matches > t->max && t->max != INFINITY) + max_matches = t->max; + if (max_matches < min_matches) + max_matches = min_matches; + endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *)); + if (endpts == NULL) + return REG_ESPACE; + endpts[0] = begin; + + d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC); + if (ISERR()) + { + FREE(endpts); + return v->err; + } + MDEBUG(("reviter %d\n", t->retry)); + + /* + * Our strategy is to first find a set of sub-match endpoints that are + * valid according to the child node's DFA, and then recursively dissect + * each sub-match to confirm validity. If any validity check fails, + * backtrack the last sub-match and try again. And, when we next try for + * a validity check, we need not recheck any successfully verified + * sub-matches that we didn't move the endpoints of. nverified remembers + * how many sub-matches are currently known okay. + */ + + /* initialize to consider first sub-match */ + nverified = 0; + k = 1; + limit = begin; + + /* iterate until satisfaction or failure */ + while (k > 0) + { + /* disallow zero-length match unless necessary to achieve min */ + if (limit == endpts[k - 1] && + limit != end && + (k >= min_matches || min_matches - k < end - limit)) + limit++; + + /* try to find an endpoint for the k'th sub-match */ + endpts[k] = shortest(v, d, endpts[k - 1], limit, end, + (chr **) NULL, (int *) NULL); + if (endpts[k] == NULL) + { + /* no match possible, so see if we can lengthen previous one */ + k--; + goto backtrack; + } + MDEBUG(("%d: working endpoint %d: %ld\n", + t->retry, k, LOFF(endpts[k]))); + + /* k'th sub-match can no longer be considered verified */ + if (nverified >= k) + nverified = k - 1; + + if (endpts[k] != end) + { + /* haven't reached end yet, try another iteration if allowed */ + if (k >= max_matches) + { + /* must try to lengthen some previous match */ + k--; + goto backtrack; + } + + k++; + limit = endpts[k - 1]; + continue; + } + + /* + * We've identified a way to divide the string into k sub-matches + * that works so far as the child DFA can tell. If k is an allowed + * number of matches, start the slow part: recurse to verify each + * sub-match. We always have k <= max_matches, needn't check that. + */ + if (k < min_matches) + goto backtrack; + + MDEBUG(("%d: verifying %d..%d\n", t->retry, nverified + 1, k)); + + for (i = nverified + 1; i <= k; i++) + { + er = dissect(v, t->left, endpts[i - 1], endpts[i]); + if (er == REG_OKAY) + { + nverified = i; + continue; + } + if (er == REG_NOMATCH) + break; + /* oops, something failed */ + freedfa(d); + FREE(endpts); + return er; + } + + if (i > k) + { + /* satisfaction */ + MDEBUG(("%d successful\n", t->retry)); + freedfa(d); + FREE(endpts); + return REG_OKAY; + } + + /* match failed to verify, so backtrack */ + +backtrack: + /* + * Must consider longer versions of the current sub-match. + */ + while (k > 0) + { + if (endpts[k] < end) + { + limit = endpts[k] + 1; + /* break out of backtrack loop, continue the outer one */ + break; + } + /* can't lengthen k'th sub-match any more, consider previous one */ + k--; + } + } + + /* all possibilities exhausted - shouldn't happen in uncomplicated mode */ + MDEBUG(("%d failed\n", t->retry)); + freedfa(d); + FREE(endpts); + return REG_ASSERT; +} + +/* * cdissect - determine subexpression matches (with complications) * The retry memory stores the offset of the trial midpoint from begin, * plus 1 so that 0 uniquely means "clean slate". @@ -717,15 +1093,18 @@ cdissect(struct vars * v, case '=': /* terminal node */ assert(t->left == NULL && t->right == NULL); return REG_OKAY; /* no action, parent did the work */ - case '|': /* alternation */ - assert(t->left != NULL); - return caltdissect(v, t, begin, end); case 'b': /* back reference */ assert(t->left == NULL && t->right == NULL); return cbrdissect(v, t, begin, end); case '.': /* concatenation */ assert(t->left != NULL && t->right != NULL); return ccondissect(v, t, begin, end); + case '|': /* alternation */ + assert(t->left != NULL); + return caltdissect(v, t, begin, end); + case '*': /* iteration */ + assert(t->left != NULL); + return citerdissect(v, t, begin, end); case '(': /* capturing */ assert(t->left != NULL && t->right == NULL); assert(t->subno > 0); @@ -847,7 +1226,7 @@ ccondissect(struct vars * v, } /* - * crevdissect - determine backref shortest-first subexpression matches + * crevdissect - shortest-first concatenation subexpression matches * The retry memory stores the offset of the trial midpoint from begin, * plus 1 so that 0 uniquely means "clean slate". */ @@ -1088,6 +1467,377 @@ caltdissect(struct vars * v, return caltdissect(v, t->right, begin, end); } +/* + * citerdissect - iteration subexpression matches (with complications) + */ +static int /* regexec return code */ +citerdissect(struct vars * v, + struct subre * t, + chr *begin, /* beginning of relevant substring */ + chr *end) /* end of same */ +{ + struct dfa *d; + chr **endpts; + chr *limit; + int min_matches; + size_t max_matches; + int nverified; + int k; + int i; + int er; + + assert(t->op == '*'); + assert(t->left != NULL && t->left->cnfa.nstates > 0); + assert(begin <= end); + + if (t->left->flags & SHORTER) /* reverse scan */ + return creviterdissect(v, t, begin, end); + + /* + * If zero matches are allowed, and target string is empty, just declare + * victory. OTOH, if target string isn't empty, zero matches can't work + * so we pretend the min is 1. + */ + min_matches = t->min; + if (min_matches <= 0) + { + if (begin == end) + return REG_OKAY; + min_matches = 1; + } + + /* + * We need workspace to track the endpoints of each sub-match. Normally + * we consider only nonzero-length sub-matches, so there can be at most + * end-begin of them. However, if min is larger than that, we will also + * consider zero-length sub-matches in order to find enough matches. + * + * For convenience, endpts[0] contains the "begin" pointer and we store + * sub-match endpoints in endpts[1..max_matches]. + */ + max_matches = end - begin; + if (max_matches > t->max && t->max != INFINITY) + max_matches = t->max; + if (max_matches < min_matches) + max_matches = min_matches; + endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *)); + if (endpts == NULL) + return REG_ESPACE; + endpts[0] = begin; + + d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC); + if (ISERR()) + { + FREE(endpts); + return v->err; + } + MDEBUG(("citer %d\n", t->retry)); + + /* + * Our strategy is to first find a set of sub-match endpoints that are + * valid according to the child node's DFA, and then recursively dissect + * each sub-match to confirm validity. If any validity check fails, + * backtrack the last sub-match and try again. And, when we next try for + * a validity check, we need not recheck any successfully verified + * sub-matches that we didn't move the endpoints of. nverified remembers + * how many sub-matches are currently known okay. + */ + + /* initialize to consider first sub-match */ + nverified = 0; + k = 1; + limit = end; + + /* iterate until satisfaction or failure */ + while (k > 0) + { + /* try to find an endpoint for the k'th sub-match */ + endpts[k] = longest(v, d, endpts[k - 1], limit, (int *) NULL); + if (endpts[k] == NULL) + { + /* no match possible, so see if we can shorten previous one */ + k--; + goto backtrack; + } + MDEBUG(("%d: working endpoint %d: %ld\n", + t->retry, k, LOFF(endpts[k]))); + + /* k'th sub-match can no longer be considered verified */ + if (nverified >= k) + nverified = k - 1; + + if (endpts[k] != end) + { + /* haven't reached end yet, try another iteration if allowed */ + if (k >= max_matches) + { + /* must try to shorten some previous match */ + k--; + goto backtrack; + } + + /* reject zero-length match unless necessary to achieve min */ + if (endpts[k] == endpts[k - 1] && + (k >= min_matches || min_matches - k < end - endpts[k])) + goto backtrack; + + k++; + limit = end; + continue; + } + + /* + * We've identified a way to divide the string into k sub-matches + * that works so far as the child DFA can tell. If k is an allowed + * number of matches, start the slow part: recurse to verify each + * sub-match. We always have k <= max_matches, needn't check that. + */ + if (k < min_matches) + goto backtrack; + + MDEBUG(("%d: verifying %d..%d\n", t->retry, nverified + 1, k)); + + for (i = nverified + 1; i <= k; i++) + { + zapmem(v, t->left); + er = cdissect(v, t->left, endpts[i - 1], endpts[i]); + if (er == REG_OKAY) + { + nverified = i; + continue; + } + if (er == REG_NOMATCH) + break; + /* oops, something failed */ + freedfa(d); + FREE(endpts); + return er; + } + + if (i > k) + { + /* satisfaction */ + MDEBUG(("%d successful\n", t->retry)); + freedfa(d); + FREE(endpts); + return REG_OKAY; + } + + /* match failed to verify, so backtrack */ + +backtrack: + /* + * Must consider shorter versions of the current sub-match. However, + * we'll only ask for a zero-length match if necessary. + */ + while (k > 0) + { + chr *prev_end = endpts[k - 1]; + + if (endpts[k] > prev_end) + { + limit = endpts[k] - 1; + if (limit > prev_end || + (k < min_matches && min_matches - k >= end - prev_end)) + { + /* break out of backtrack loop, continue the outer one */ + break; + } + } + /* can't shorten k'th sub-match any more, consider previous one */ + k--; + } + } + + /* all possibilities exhausted */ + MDEBUG(("%d failed\n", t->retry)); + freedfa(d); + FREE(endpts); + return REG_NOMATCH; +} + +/* + * creviterdissect - shortest-first iteration subexpression matches + */ +static int /* regexec return code */ +creviterdissect(struct vars * v, + struct subre * t, + chr *begin, /* beginning of relevant substring */ + chr *end) /* end of same */ +{ + struct dfa *d; + chr **endpts; + chr *limit; + int min_matches; + size_t max_matches; + int nverified; + int k; + int i; + int er; + + assert(t->op == '*'); + assert(t->left != NULL && t->left->cnfa.nstates > 0); + assert(t->left->flags & SHORTER); + assert(begin <= end); + + /* + * If zero matches are allowed, and target string is empty, just declare + * victory. OTOH, if target string isn't empty, zero matches can't work + * so we pretend the min is 1. + */ + min_matches = t->min; + if (min_matches <= 0) + { + if (begin == end) + return REG_OKAY; + min_matches = 1; + } + + /* + * We need workspace to track the endpoints of each sub-match. Normally + * we consider only nonzero-length sub-matches, so there can be at most + * end-begin of them. However, if min is larger than that, we will also + * consider zero-length sub-matches in order to find enough matches. + * + * For convenience, endpts[0] contains the "begin" pointer and we store + * sub-match endpoints in endpts[1..max_matches]. + */ + max_matches = end - begin; + if (max_matches > t->max && t->max != INFINITY) + max_matches = t->max; + if (max_matches < min_matches) + max_matches = min_matches; + endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *)); + if (endpts == NULL) + return REG_ESPACE; + endpts[0] = begin; + + d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC); + if (ISERR()) + { + FREE(endpts); + return v->err; + } + MDEBUG(("creviter %d\n", t->retry)); + + /* + * Our strategy is to first find a set of sub-match endpoints that are + * valid according to the child node's DFA, and then recursively dissect + * each sub-match to confirm validity. If any validity check fails, + * backtrack the last sub-match and try again. And, when we next try for + * a validity check, we need not recheck any successfully verified + * sub-matches that we didn't move the endpoints of. nverified remembers + * how many sub-matches are currently known okay. + */ + + /* initialize to consider first sub-match */ + nverified = 0; + k = 1; + limit = begin; + + /* iterate until satisfaction or failure */ + while (k > 0) + { + /* disallow zero-length match unless necessary to achieve min */ + if (limit == endpts[k - 1] && + limit != end && + (k >= min_matches || min_matches - k < end - limit)) + limit++; + + /* try to find an endpoint for the k'th sub-match */ + endpts[k] = shortest(v, d, endpts[k - 1], limit, end, + (chr **) NULL, (int *) NULL); + if (endpts[k] == NULL) + { + /* no match possible, so see if we can lengthen previous one */ + k--; + goto backtrack; + } + MDEBUG(("%d: working endpoint %d: %ld\n", + t->retry, k, LOFF(endpts[k]))); + + /* k'th sub-match can no longer be considered verified */ + if (nverified >= k) + nverified = k - 1; + + if (endpts[k] != end) + { + /* haven't reached end yet, try another iteration if allowed */ + if (k >= max_matches) + { + /* must try to lengthen some previous match */ + k--; + goto backtrack; + } + + k++; + limit = endpts[k - 1]; + continue; + } + + /* + * We've identified a way to divide the string into k sub-matches + * that works so far as the child DFA can tell. If k is an allowed + * number of matches, start the slow part: recurse to verify each + * sub-match. We always have k <= max_matches, needn't check that. + */ + if (k < min_matches) + goto backtrack; + + MDEBUG(("%d: verifying %d..%d\n", t->retry, nverified + 1, k)); + + for (i = nverified + 1; i <= k; i++) + { + zapmem(v, t->left); + er = cdissect(v, t->left, endpts[i - 1], endpts[i]); + if (er == REG_OKAY) + { + nverified = i; + continue; + } + if (er == REG_NOMATCH) + break; + /* oops, something failed */ + freedfa(d); + FREE(endpts); + return er; + } + + if (i > k) + { + /* satisfaction */ + MDEBUG(("%d successful\n", t->retry)); + freedfa(d); + FREE(endpts); + return REG_OKAY; + } + + /* match failed to verify, so backtrack */ + +backtrack: + /* + * Must consider longer versions of the current sub-match. + */ + while (k > 0) + { + if (endpts[k] < end) + { + limit = endpts[k] + 1; + /* break out of backtrack loop, continue the outer one */ + break; + } + /* can't lengthen k'th sub-match any more, consider previous one */ + k--; + } + } + + /* all possibilities exhausted */ + MDEBUG(("%d failed\n", t->retry)); + freedfa(d); + FREE(endpts); + return REG_NOMATCH; +} + #include "rege_dfa.c" |