aboutsummaryrefslogtreecommitdiff
path: root/src/backend/regex/regcomp.c
diff options
context:
space:
mode:
authorTom Lane <tgl@sss.pgh.pa.us>2021-02-20 18:11:56 -0500
committerTom Lane <tgl@sss.pgh.pa.us>2021-02-20 18:11:56 -0500
commit08c0d6ad65f7c161add82ae906efb90dbd7f653d (patch)
treecc6376d6fd084c6584b18d818c0816e3b7e190c9 /src/backend/regex/regcomp.c
parent17661188336c8cbb1783808912096932c57893a3 (diff)
downloadpostgresql-08c0d6ad65f7c161add82ae906efb90dbd7f653d.tar.gz
postgresql-08c0d6ad65f7c161add82ae906efb90dbd7f653d.zip
Invent "rainbow" arcs within the regex engine.
Some regular expression constructs, most notably the "." match-anything metacharacter, produce a sheaf of parallel NFA arcs covering all possible colors (that is, character equivalence classes). We can make a noticeable improvement in the space and time needed to process large regexes by replacing such cases with a single arc bearing the special color code "RAINBOW". This requires only minor additional complication in places such as pull() and push(). Callers of pg_reg_getoutarcs() must now be prepared for the possibility of seeing a RAINBOW arc. For the one known user, contrib/pg_trgm, that's a net benefit since it cuts the number of arcs to be dealt with, and the handling isn't any different than for other colors that contain too many characters to be dealt with individually. This is part of a patch series that in total reduces the regex engine's runtime by about a factor of four on a large corpus of real-world regexes. Patch by me, reviewed by Joel Jacobson Discussion: https://postgr.es/m/1340281.1613018383@sss.pgh.pa.us
Diffstat (limited to 'src/backend/regex/regcomp.c')
-rw-r--r--src/backend/regex/regcomp.c12
1 files changed, 8 insertions, 4 deletions
diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c
index cd0caaa2d03..ae8dbe58191 100644
--- a/src/backend/regex/regcomp.c
+++ b/src/backend/regex/regcomp.c
@@ -158,7 +158,8 @@ static int push(struct nfa *, struct arc *, struct state **);
#define INCOMPATIBLE 1 /* destroys arc */
#define SATISFIED 2 /* constraint satisfied */
#define COMPATIBLE 3 /* compatible but not satisfied yet */
-static int combine(struct arc *, struct arc *);
+#define REPLACEARC 4 /* replace arc's color with constraint color */
+static int combine(struct nfa *nfa, struct arc *con, struct arc *a);
static void fixempties(struct nfa *, FILE *);
static struct state *emptyreachable(struct nfa *, struct state *,
struct state *, struct arc **);
@@ -289,9 +290,11 @@ struct vars
#define SBEGIN 'A' /* beginning of string (even if not BOL) */
#define SEND 'Z' /* end of string (even if not EOL) */
-/* is an arc colored, and hence on a color chain? */
+/* is an arc colored, and hence should belong to a color chain? */
+/* the test on "co" eliminates RAINBOW arcs, which we don't bother to chain */
#define COLORED(a) \
- ((a)->type == PLAIN || (a)->type == AHEAD || (a)->type == BEHIND)
+ ((a)->co >= 0 && \
+ ((a)->type == PLAIN || (a)->type == AHEAD || (a)->type == BEHIND))
/* static function list */
@@ -1393,7 +1396,8 @@ bracket(struct vars *v,
* cbracket - handle complemented bracket expression
* We do it by calling bracket() with dummy endpoints, and then complementing
* the result. The alternative would be to invoke rainbow(), and then delete
- * arcs as the b.e. is seen... but that gets messy.
+ * arcs as the b.e. is seen... but that gets messy, and is really quite
+ * infeasible now that rainbow() just puts out one RAINBOW arc.
*/
static void
cbracket(struct vars *v,