aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTom Lane <tgl@sss.pgh.pa.us>2019-05-12 18:53:13 -0400
committerTom Lane <tgl@sss.pgh.pa.us>2019-05-12 18:53:43 -0400
commitd6de52636827ab64f537eb78657db125b082bb65 (patch)
treedcbfe5b223a2325e14b969705a888296f2207fac
parent9dc65bcf9dece8aa92f554db51112645c9751c9e (diff)
downloadpostgresql-d6de52636827ab64f537eb78657db125b082bb65.tar.gz
postgresql-d6de52636827ab64f537eb78657db125b082bb65.zip
Fix misoptimization of "{1,1}" quantifiers in regular expressions.
A bounded quantifier with m = n = 1 might be thought a no-op. But according to our documentation (which traces back to Henry Spencer's original man page) it still imposes greediness, or non-greediness in the case of the non-greedy variant "{1,1}?", on whatever it's attached to. This turns out not to work though, because parseqatom() optimizes away the m = n = 1 case without regard for whether it's supposed to change the greediness of the argument RE. We can fix this by just not applying the optimization when the greediness needs to change; the subsequent general cases handle it fine. The three cases in which we can still apply the optimization are (a) no quantifier, or quantifier does not impose a preference; (b) atom has no greediness property, implying it cannot match a variable amount of text anyway; or (c) quantifier's greediness is same as atom's. Note that in most cases where one of these applies, we'd have exited earlier in the "not a messy case" fast path. I think it's now only possible to get to the optimization when the atom involves capturing parentheses or a non-top-level backref. Back-patch to all supported branches. I'd ordinarily be hesitant to put a subtle behavioral change into back branches, but in this case it's very hard to see a reason why somebody would write "{1,1}?" unless they're trying to get the documented change-of-greediness behavior. Discussion: https://postgr.es/m/5bb27a41-350d-37bf-901e-9d26f5592dd0@charter.net
-rw-r--r--src/backend/regex/regcomp.c5
-rw-r--r--src/test/regress/expected/regex.out49
-rw-r--r--src/test/regress/sql/regex.sql10
3 files changed, 63 insertions, 1 deletions
diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c
index 7ae9673a7d5..f72c41692bb 100644
--- a/src/backend/regex/regcomp.c
+++ b/src/backend/regex/regcomp.c
@@ -1147,7 +1147,10 @@ parseqatom(struct vars * v,
/* rest of branch can be strung starting from atom->end */
s2 = atom->end;
}
- else if (m == 1 && n == 1)
+ else if (m == 1 && n == 1 &&
+ (qprefer == 0 ||
+ (atom->flags & (LONGER | SHORTER | MIXED)) == 0 ||
+ qprefer == (atom->flags & (LONGER | SHORTER | MIXED))))
{
/* no/vacuous quantifier: done */
EMPTYARC(s, atom->begin); /* empty prefix */
diff --git a/src/test/regress/expected/regex.out b/src/test/regress/expected/regex.out
index 2b4f2ec2522..e5a4777f1bb 100644
--- a/src/test/regress/expected/regex.out
+++ b/src/test/regress/expected/regex.out
@@ -295,6 +295,55 @@ select regexp_matches('foo/bar/baz',
{foo,bar,baz}
(1 row)
+-- Test that greediness can be overridden by outer quantifier
+select regexp_matches('llmmmfff', '^(l*)(.*)(f*)$');
+ regexp_matches
+----------------
+ {ll,mmmfff,""}
+(1 row)
+
+select regexp_matches('llmmmfff', '^(l*){1,1}(.*)(f*)$');
+ regexp_matches
+----------------
+ {ll,mmmfff,""}
+(1 row)
+
+select regexp_matches('llmmmfff', '^(l*){1,1}?(.*)(f*)$');
+ regexp_matches
+------------------
+ {"",llmmmfff,""}
+(1 row)
+
+select regexp_matches('llmmmfff', '^(l*){1,1}?(.*){1,1}?(f*)$');
+ regexp_matches
+----------------
+ {"",llmmm,fff}
+(1 row)
+
+select regexp_matches('llmmmfff', '^(l*?)(.*)(f*)$');
+ regexp_matches
+------------------
+ {"",llmmmfff,""}
+(1 row)
+
+select regexp_matches('llmmmfff', '^(l*?){1,1}(.*)(f*)$');
+ regexp_matches
+----------------
+ {ll,mmmfff,""}
+(1 row)
+
+select regexp_matches('llmmmfff', '^(l*?){1,1}?(.*)(f*)$');
+ regexp_matches
+------------------
+ {"",llmmmfff,""}
+(1 row)
+
+select regexp_matches('llmmmfff', '^(l*?){1,1}?(.*){1,1}?(f*)$');
+ regexp_matches
+----------------
+ {"",llmmm,fff}
+(1 row)
+
-- Test for infinite loop in cfindloop with zero-length possible match
-- but no actual match (can only happen in the presence of backrefs)
select 'a' ~ '$()|^\1';
diff --git a/src/test/regress/sql/regex.sql b/src/test/regress/sql/regex.sql
index 635f068eae9..9c2e9c54ad8 100644
--- a/src/test/regress/sql/regex.sql
+++ b/src/test/regress/sql/regex.sql
@@ -76,6 +76,16 @@ select regexp_matches('Programmer', '(\w)(.*?\1)', 'g');
select regexp_matches('foo/bar/baz',
'^([^/]+?)(?:/([^/]+?))(?:/([^/]+?))?$', '');
+-- Test that greediness can be overridden by outer quantifier
+select regexp_matches('llmmmfff', '^(l*)(.*)(f*)$');
+select regexp_matches('llmmmfff', '^(l*){1,1}(.*)(f*)$');
+select regexp_matches('llmmmfff', '^(l*){1,1}?(.*)(f*)$');
+select regexp_matches('llmmmfff', '^(l*){1,1}?(.*){1,1}?(f*)$');
+select regexp_matches('llmmmfff', '^(l*?)(.*)(f*)$');
+select regexp_matches('llmmmfff', '^(l*?){1,1}(.*)(f*)$');
+select regexp_matches('llmmmfff', '^(l*?){1,1}?(.*)(f*)$');
+select regexp_matches('llmmmfff', '^(l*?){1,1}?(.*){1,1}?(f*)$');
+
-- Test for infinite loop in cfindloop with zero-length possible match
-- but no actual match (can only happen in the presence of backrefs)
select 'a' ~ '$()|^\1';