Avoid determining regexp subexpression matches, when possible.

Identifying the precise match locations for parenthesized subexpressions is a fairly expensive task given the way our regexp engine works, both at regexp compile time (where we must create an optimized NFA for each parenthesized subexpression) and at runtime (where determining exact match locations requires laborious search). Up to now we've made little attempt to optimize this situation. This patch identifies cases where we know at compile time that we won't need to know subexpression match locations, and teaches the regexp compiler to not bother creating per-subexpression regexps for parenthesis pairs that are not referenced by backrefs elsewhere in the regexp. (To preserve semantics, we obviously still have to pin down the match locations of the subexpressions that backrefs refer to.) Users could have obtained the same results before this by being careful to write "non-capturing" parentheses wherever possible, but few people bother with that. Discussion: https://postgr.es/m/2219936.1628115334@sss.pgh.pa.us
author: Tom Lane <tgl@sss.pgh.pa.us> 2021-08-09 11:26:34 -0400
committer: Tom Lane <tgl@sss.pgh.pa.us> 2021-08-09 11:26:34 -0400
commit: 0e6aa8747d439bb7f08f95e358f0509c50396785 (patch)
tree: 64977d63a54b4f5c266692966282c6d7aab3aba7 /src/test/modules/test_regex/sql/test_regex.sql
parent: 76ad24400d73fa10d527844d50bedf7dacb1e87b (diff)
download: postgresql-0e6aa8747d439bb7f08f95e358f0509c50396785.tar.gz
postgresql-0e6aa8747d439bb7f08f95e358f0509c50396785.zip
1 file changed, 11 insertions, 2 deletions
diff --git a/src/test/modules/test_regex/sql/test_regex.sql b/src/test/modules/test_regex/sql/test_regex.sql
index 9224fdfdd3a..3419564203a 100644
--- a/src/test/modules/test_regex/sql/test_regex.sql
+++ b/src/test/modules/test_regex/sql/test_regex.sql
@@ -63,8 +63,8 @@ select * from test_regex('ab', 'ab', 'b');
 
 -- expectMatch	4.1  -		(a)e		ae	ae	a
 select * from test_regex('(a)e', 'ae', '-');
--- expectMatch	4.2  o		(a)e		ae
-select * from test_regex('(a)e', 'ae', 'o');
+-- expectMatch	4.2  oPR	(.)\1e		abeaae	aae	{}
+select * from test_regex('(.)\1e', 'abeaae', 'oPR');
 -- expectMatch	4.3  b		{\(a\)b}	ab	ab	a
 select * from test_regex('\(a\)b', 'ab', 'b');
 -- expectMatch	4.4  -		a((b)c)		abc	abc	bc	b
@@ -775,6 +775,11 @@ select * from test_regex('(^\w+).*\1', 'abc abc abc', 'LRP');
 select * from test_regex('(^\w+\M).*\1', 'abc abcd abd', 'LRP');
 select * from test_regex('(\w+(?= )).*\1', 'abc abcd abd', 'HLRP');
 
+-- exercise oversize-regmatch_t-array paths in regexec()
+-- (that case is not reachable via test_regex, sadly)
+select substring('fffoooooooooooooooooooooooooooooooo', '^(.)\1(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)');
+select regexp_split_to_array('abcxxxdefyyyghi', '((.))(\1\2)');
+
 -- doing 15 "octal escapes vs back references"
 
 -- # initial zero is always octal
@@ -1011,6 +1016,10 @@ select * from test_regex('(a*)*', 'bc', 'N');
 select * from test_regex(' TO (([a-z0-9._]+|"([^"]+|"")+")+)', 'asd TO foo', 'M');
 -- expectMatch	21.36 RPQ	((.))(\2){0}	xy	x	x	x	{}
 select * from test_regex('((.))(\2){0}', 'xy', 'RPQ');
+-- expectMatch	21.37 RP	((.))(\2)	xyy	yy	y	y	y
+select * from test_regex('((.))(\2)', 'xyy', 'RP');
+-- expectMatch	21.38 oRP	((.))(\2)	xyy	yy	{}	{}	{}
+select * from test_regex('((.))(\2)', 'xyy', 'oRP');
 
 -- doing 22 "multicharacter collating elements"
 -- # again ugh
author	Tom Lane <tgl@sss.pgh.pa.us>	2021-08-09 11:26:34 -0400
committer	Tom Lane <tgl@sss.pgh.pa.us>	2021-08-09 11:26:34 -0400
commit	0e6aa8747d439bb7f08f95e358f0509c50396785 (patch)
tree	64977d63a54b4f5c266692966282c6d7aab3aba7 /src/test/modules/test_regex/sql/test_regex.sql
parent	76ad24400d73fa10d527844d50bedf7dacb1e87b (diff)
download	postgresql-0e6aa8747d439bb7f08f95e358f0509c50396785.tar.gz postgresql-0e6aa8747d439bb7f08f95e358f0509c50396785.zip