diff options
author | Tom Lane <tgl@sss.pgh.pa.us> | 2016-09-05 17:06:29 -0400 |
---|---|---|
committer | Tom Lane <tgl@sss.pgh.pa.us> | 2016-09-05 17:06:29 -0400 |
commit | c54159d44ceaba26ceda9fea1804f0de122a8f30 (patch) | |
tree | eaab96027f054cf5ff864d0745e446e8b8e13544 /src/backend/regex/regcomp.c | |
parent | f80049f76a32858601510eaaef19ab8160e4c9b3 (diff) | |
download | postgresql-c54159d44ceaba26ceda9fea1804f0de122a8f30.tar.gz postgresql-c54159d44ceaba26ceda9fea1804f0de122a8f30.zip |
Make locale-dependent regex character classes work for large char codes.
Previously, we failed to recognize Unicode characters above U+7FF as
being members of locale-dependent character classes such as [[:alpha:]].
(Actually, the same problem occurs for large pg_wchar values in any
multibyte encoding, but UTF8 is the only case people have actually
complained about.) It's impractical to get Spencer's original code to
handle character classes or ranges containing many thousands of characters,
because it insists on considering each member character individually at
regex compile time, whether or not the character will ever be of interest
at run time. To fix, choose a cutoff point MAX_SIMPLE_CHR below which
we process characters individually as before, and deal with entire ranges
or classes as single entities above that. We can actually make things
cheaper than before for chars below the cutoff, because the color map can
now be a simple linear array for those chars, rather than the multilevel
tree structure Spencer designed. It's more expensive than before for
chars above the cutoff, because we must do a binary search in a list of
high chars and char ranges used in the regex pattern, plus call iswalpha()
and friends for each locale-dependent character class used in the pattern.
However, multibyte encodings are normally designed to give smaller codes
to popular characters, so that we can expect that the slow path will be
taken relatively infrequently. In any case, the speed penalty appears
minor except when we have to apply iswalpha() etc. to high character codes
at runtime --- and the previous coding gave wrong answers for those cases,
so whether it was faster is moot.
Tom Lane, reviewed by Heikki Linnakangas
Discussion: <15563.1471913698@sss.pgh.pa.us>
Diffstat (limited to 'src/backend/regex/regcomp.c')
-rw-r--r-- | src/backend/regex/regcomp.c | 62 |
1 files changed, 16 insertions, 46 deletions
diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c index b211cc0a189..ed95474b3f0 100644 --- a/src/backend/regex/regcomp.c +++ b/src/backend/regex/regcomp.c @@ -55,7 +55,6 @@ static void cbracket(struct vars *, struct state *, struct state *); static void brackpart(struct vars *, struct state *, struct state *); static const chr *scanplain(struct vars *); static void onechr(struct vars *, chr, struct state *, struct state *); -static void dovec(struct vars *, struct cvec *, struct state *, struct state *); static void wordchrs(struct vars *); static void processlacon(struct vars *, struct state *, struct state *, int, struct state *, struct state *); @@ -96,16 +95,19 @@ static chr chrnamed(struct vars *, const chr *, const chr *, chr); /* === regc_color.c === */ static void initcm(struct vars *, struct colormap *); static void freecm(struct colormap *); -static void cmtreefree(struct colormap *, union tree *, int); -static color setcolor(struct colormap *, chr, color); static color maxcolor(struct colormap *); static color newcolor(struct colormap *); static void freecolor(struct colormap *, color); static color pseudocolor(struct colormap *); -static color subcolor(struct colormap *, chr c); +static color subcolor(struct colormap *, chr); +static color subcolorhi(struct colormap *, color *); static color newsub(struct colormap *, color); -static void subrange(struct vars *, chr, chr, struct state *, struct state *); -static void subblock(struct vars *, chr, struct state *, struct state *); +static int newhicolorrow(struct colormap *, int); +static void newhicolorcols(struct colormap *); +static void subcolorcvec(struct vars *, struct cvec *, struct state *, struct state *); +static void subcoloronechr(struct vars *, chr, struct state *, struct state *, color *); +static void subcoloronerange(struct vars *, chr, chr, struct state *, struct state *, color *); +static void subcoloronerow(struct vars *, int, struct state *, struct state *, color *); static void okcolors(struct nfa *, struct colormap *); static void colorchain(struct colormap *, struct arc *); static void uncolorchain(struct colormap *, struct arc *); @@ -114,7 +116,6 @@ static void colorcomplement(struct nfa *, struct colormap *, int, struct state * #ifdef REG_DEBUG static void dumpcolors(struct colormap *, FILE *); -static void fillcheck(struct colormap *, union tree *, int, FILE *); static void dumpchr(chr, FILE *); #endif /* === regc_nfa.c === */ @@ -215,6 +216,7 @@ static struct cvec *range(struct vars *, chr, chr, int); static int before(chr, chr); static struct cvec *eclass(struct vars *, chr, int); static struct cvec *cclass(struct vars *, const chr *, const chr *, int); +static int cclass_column_index(struct colormap *, chr); static struct cvec *allcases(struct vars *, chr); static int cmp(const chr *, const chr *, size_t); static int casecmp(const chr *, const chr *, size_t); @@ -1467,7 +1469,7 @@ brackpart(struct vars * v, NOERR(); cv = eclass(v, startc, (v->cflags & REG_ICASE)); NOERR(); - dovec(v, cv, lp, rp); + subcolorcvec(v, cv, lp, rp); return; break; case CCLASS: @@ -1477,7 +1479,7 @@ brackpart(struct vars * v, NOERR(); cv = cclass(v, startp, endp, (v->cflags & REG_ICASE)); NOERR(); - dovec(v, cv, lp, rp); + subcolorcvec(v, cv, lp, rp); return; break; default: @@ -1523,7 +1525,7 @@ brackpart(struct vars * v, NOTE(REG_UUNPORT); cv = range(v, startc, endc, (v->cflags & REG_ICASE)); NOERR(); - dovec(v, cv, lp, rp); + subcolorcvec(v, cv, lp, rp); } /* @@ -1565,46 +1567,14 @@ onechr(struct vars * v, { if (!(v->cflags & REG_ICASE)) { - newarc(v->nfa, PLAIN, subcolor(v->cm, c), lp, rp); + color lastsubcolor = COLORLESS; + + subcoloronechr(v, c, lp, rp, &lastsubcolor); return; } /* rats, need general case anyway... */ - dovec(v, allcases(v, c), lp, rp); -} - -/* - * dovec - fill in arcs for each element of a cvec - */ -static void -dovec(struct vars * v, - struct cvec * cv, - struct state * lp, - struct state * rp) -{ - chr ch, - from, - to; - const chr *p; - int i; - - /* ordinary characters */ - for (p = cv->chrs, i = cv->nchrs; i > 0; p++, i--) - { - ch = *p; - newarc(v->nfa, PLAIN, subcolor(v->cm, ch), lp, rp); - NOERR(); - } - - /* and the ranges */ - for (p = cv->ranges, i = cv->nranges; i > 0; p += 2, i--) - { - from = *p; - to = *(p + 1); - if (from <= to) - subrange(v, from, to, lp, rp); - NOERR(); - } + subcolorcvec(v, allcases(v, c), lp, rp); } /* |