diff options
author | Michael Paquier <michael@paquier.xyz> | 2025-05-28 08:58:40 +0900 |
---|---|---|
committer | Michael Paquier <michael@paquier.xyz> | 2025-05-28 08:58:40 +0900 |
commit | d46911e584d48ee01d5bf75699a77fbf635c865d (patch) | |
tree | 3e7e39095ad21a728f7bb0667fe48c1249d3c05b /src/backend/utils/adt/regexp.c | |
parent | 3e782ca32225e5d5219251a5a3c06698c1e824f8 (diff) | |
download | postgresql-d46911e584d48ee01d5bf75699a77fbf635c865d.tar.gz postgresql-d46911e584d48ee01d5bf75699a77fbf635c865d.zip |
Fix conversion of SIMILAR TO regexes for character classes
The code that translates SIMILAR TO pattern matching expressions to
POSIX-style regular expressions did not consider that square brackets
can be nested. For example, in an expression like [[:alpha:]%_], the
logic replaced the placeholders '_' and '%' but it should not.
This commit fixes the conversion logic by tracking the nesting level of
square brackets marking character class areas, while considering that
in expressions like []] or [^]] the first closing square bracket is a
regular character. Multiple tests are added to show how the conversions
should or should not apply applied while in a character class area, with
specific cases added for all the characters converted outside character
classes like an opening parenthesis '(', dollar sign '$', etc.
Author: Laurenz Albe <laurenz.albe@cybertec.at>
Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us>
Reviewed-by: Michael Paquier <michael@paquier.xyz>
Discussion: https://postgr.es/m/16ab039d1af455652bdf4173402ddda145f2c73b.camel@cybertec.at
Backpatch-through: 13
Diffstat (limited to 'src/backend/utils/adt/regexp.c')
-rw-r--r-- | src/backend/utils/adt/regexp.c | 38 |
1 files changed, 32 insertions, 6 deletions
diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c index edee1f7880b..6e2864cbbda 100644 --- a/src/backend/utils/adt/regexp.c +++ b/src/backend/utils/adt/regexp.c @@ -773,8 +773,11 @@ similar_escape_internal(text *pat_text, text *esc_text) int plen, elen; bool afterescape = false; - bool incharclass = false; int nquotes = 0; + int charclass_depth = 0; /* Nesting level of character classes, + * encompassed by square brackets */ + int charclass_start = 0; /* State of the character class start, + * for carets */ p = VARDATA_ANY(pat_text); plen = VARSIZE_ANY_EXHDR(pat_text); @@ -904,7 +907,7 @@ similar_escape_internal(text *pat_text, text *esc_text) /* fast path */ if (afterescape) { - if (pchar == '"' && !incharclass) /* escape-double-quote? */ + if (pchar == '"' && charclass_depth < 1) /* escape-double-quote? */ { /* emit appropriate part separator, per notes above */ if (nquotes == 0) @@ -953,18 +956,41 @@ similar_escape_internal(text *pat_text, text *esc_text) /* SQL escape character; do not send to output */ afterescape = true; } - else if (incharclass) + else if (charclass_depth > 0) { if (pchar == '\\') *r++ = '\\'; *r++ = pchar; - if (pchar == ']') - incharclass = false; + + /* + * Ignore a closing bracket at the start of a character class. + * Such a bracket is taken literally rather than closing the + * class. "charclass_start" is 1 right at the beginning of a + * class and 2 after an initial caret. + */ + if (pchar == ']' && charclass_start > 2) + charclass_depth--; + else if (pchar == '[') + charclass_depth++; + + /* + * If there is a caret right after the opening bracket, it negates + * the character class, but a following closing bracket should + * still be treated as a normal character. That holds only for + * the first caret, so only the values 1 and 2 mean that closing + * brackets should be taken literally. + */ + if (pchar == '^') + charclass_start++; + else + charclass_start = 3; /* definitely past the start */ } else if (pchar == '[') { + /* start of a character class */ *r++ = pchar; - incharclass = true; + charclass_depth++; + charclass_start = 1; } else if (pchar == '%') { |