aboutsummaryrefslogtreecommitdiff
path: root/src/include/regex/regguts.h
diff options
context:
space:
mode:
author Tom Lane <tgl@sss.pgh.pa.us> 2021-08-09 11:26:34 -0400
committer Tom Lane <tgl@sss.pgh.pa.us> 2021-08-09 11:26:34 -0400
commit 0e6aa8747d439bb7f08f95e358f0509c50396785 (patch)
tree 64977d63a54b4f5c266692966282c6d7aab3aba7 /src/include/regex/regguts.h
parent 76ad24400d73fa10d527844d50bedf7dacb1e87b (diff)
download postgresql-0e6aa8747d439bb7f08f95e358f0509c50396785.tar.gz
postgresql-0e6aa8747d439bb7f08f95e358f0509c50396785.zip
Avoid determining regexp subexpression matches, when possible.
Identifying the precise match locations for parenthesized subexpressions is a fairly expensive task given the way our regexp engine works, both at regexp compile time (where we must create an optimized NFA for each parenthesized subexpression) and at runtime (where determining exact match locations requires laborious search). Up to now we've made little attempt to optimize this situation. This patch identifies cases where we know at compile time that we won't need to know subexpression match locations, and teaches the regexp compiler to not bother creating per-subexpression regexps for parenthesis pairs that are not referenced by backrefs elsewhere in the regexp. (To preserve semantics, we obviously still have to pin down the match locations of backref references.) Users could have obtained the same results before this by being careful to write "non-capturing" parentheses wherever possible, but few people bother with that.
Discussion: https://postgr.es/m/2219936.1628115334@sss.pgh.pa.us
Diffstat (limited to 'src/include/regex/regguts.h')
-rw-r--r-- src/include/regex/regguts.h 7
1 files changed, 4 insertions, 3 deletions
diff --git a/src/include/regex/regguts.h b/src/include/regex/regguts.h
index 8db631c83bb..91a52840c47 100644
--- a/src/include/regex/regguts.h
+++ b/src/include/regex/regguts.h
@@ -477,13 +477,14 @@ struct subre
#define MIXED 04 /* mixed preference below */
#define CAP 010 /* capturing parens here or below */
#define BACKR 020 /* back reference here or below */
+#define BRUSE 040 /* is referenced by a back reference */
#define INUSE 0100 /* in use in final tree */
-#define NOPROP 03 /* bits which may not propagate up */
+#define UPPROP (MIXED|CAP|BACKR) /* flags which should propagate up */
#define LMIX(f) ((f)<<2) /* LONGER -> MIXED */
#define SMIX(f) ((f)<<1) /* SHORTER -> MIXED */
-#define UP(f) (((f)&~NOPROP) | (LMIX(f) & SMIX(f) & MIXED))
+#define UP(f) (((f)&UPPROP) | (LMIX(f) & SMIX(f) & MIXED))
#define MESSY(f) ((f)&(MIXED|CAP|BACKR))
-#define PREF(f) ((f)&NOPROP)
+#define PREF(f) ((f)&(LONGER|SHORTER))
#define PREF2(f1, f2) ((PREF(f1) != 0) ? PREF(f1) : PREF(f2))
#define COMBINE(f1, f2) (UP((f1)|(f2)) | PREF2(f1, f2))
char latype; /* LATYPE code, if lookaround constraint */