aboutsummaryrefslogtreecommitdiff
path: root/src/include/regex/regguts.h
diff options
context:
space:
mode:
authorTom Lane <tgl@sss.pgh.pa.us>2021-02-20 19:26:41 -0500
committerTom Lane <tgl@sss.pgh.pa.us>2021-02-20 19:26:41 -0500
commitea1268f6301cc7adce571cc9c5ebe8d9342a2ef4 (patch)
tree036b342d06097d3118a31dc9b3f65a5756c7730c /src/include/regex/regguts.h
parent581043089472816061a7fd381f40572191dfa48f (diff)
downloadpostgresql-ea1268f6301cc7adce571cc9c5ebe8d9342a2ef4.tar.gz
postgresql-ea1268f6301cc7adce571cc9c5ebe8d9342a2ef4.zip
Avoid generating extra subre tree nodes for capturing parentheses.
Previously, each pair of capturing parentheses gave rise to a separate subre tree node, whose only function was to identify that we ought to capture the match details for this particular sub-expression. In most cases we don't really need that, since we can perfectly well put a "capture this" annotation on the child node that does the real matching work. As with the two preceding commits, the main value of this is to avoid generating and optimizing an NFA for a tree node that's not really pulling its weight. The chosen data representation only allows one capture annotation per subre node. In the legal-per-spec, but seemingly not very useful, case where there are multiple capturing parens around the exact same bit of the regex (i.e. "((xyz))"), wrap the child node in N-1 capture nodes that act the same as before. We could work harder at that but I'll refrain, pending some evidence that such cases are worth troubling over. In passing, improve the comments in regex.h to say what all the different re_info bits mean. Some of them were pretty obvious but others not so much, so reverse-engineer some documentation. This is part of a patch series that in total reduces the regex engine's runtime by about a factor of four on a large corpus of real-world regexes. Patch by me, reviewed by Joel Jacobson Discussion: https://postgr.es/m/1340281.1613018383@sss.pgh.pa.us
Diffstat (limited to 'src/include/regex/regguts.h')
-rw-r--r--src/include/regex/regguts.h13
1 files changed, 7 insertions, 6 deletions
diff --git a/src/include/regex/regguts.h b/src/include/regex/regguts.h
index 90ee16957ac..306525eb5fa 100644
--- a/src/include/regex/regguts.h
+++ b/src/include/regex/regguts.h
@@ -422,7 +422,7 @@ struct cnfa
* "op" is one of:
* '=' plain regex without interesting substructure (implemented as DFA)
* 'b' back-reference (has no substructure either)
- * '(' capture node: captures the match of its single child
+ * '(' no-op capture node: captures the match of its single child
* '.' concatenation: matches a match for first child, then second child
* '|' alternation: matches a match for any of its children
* '*' iteration: matches some number of matches of its single child
@@ -446,8 +446,8 @@ struct subre
#define LONGER 01 /* prefers longer match */
#define SHORTER 02 /* prefers shorter match */
#define MIXED 04 /* mixed preference below */
-#define CAP 010 /* capturing parens below */
-#define BACKR 020 /* back reference below */
+#define CAP 010 /* capturing parens here or below */
+#define BACKR 020 /* back reference here or below */
#define INUSE 0100 /* in use in final tree */
#define NOPROP 03 /* bits which may not propagate up */
#define LMIX(f) ((f)<<2) /* LONGER -> MIXED */
@@ -457,9 +457,10 @@ struct subre
#define PREF(f) ((f)&NOPROP)
#define PREF2(f1, f2) ((PREF(f1) != 0) ? PREF(f1) : PREF(f2))
#define COMBINE(f1, f2) (UP((f1)|(f2)) | PREF2(f1, f2))
- short id; /* ID of subre (1..ntree-1) */
- int subno; /* subexpression number for 'b' and '(', or
- * LATYPE code for lookaround constraint */
+ char latype; /* LATYPE code, if lookaround constraint */
+ int id; /* ID of subre (1..ntree-1) */
+ int capno; /* if capture node, subno to capture into */
+ int backno; /* if backref node, subno it refers to */
short min; /* min repetitions for iteration or backref */
short max; /* max repetitions for iteration or backref */
struct subre *child; /* first child, if any (also freelist chain) */