diff options
Diffstat (limited to 'src/backend/utils/adt/like_match.c')
-rw-r--r-- | src/backend/utils/adt/like_match.c | 149 |
1 files changed, 147 insertions, 2 deletions
diff --git a/src/backend/utils/adt/like_match.c b/src/backend/utils/adt/like_match.c index f561cc15e4c..afe5406cf40 100644 --- a/src/backend/utils/adt/like_match.c +++ b/src/backend/utils/adt/like_match.c @@ -157,7 +157,9 @@ MatchText(const char *t, int tlen, const char *p, int plen, pg_locale_t locale) * the first pattern byte to each text byte to avoid recursing * more than we have to. This fact also guarantees that we don't * have to consider a match to the zero-length substring at the - * end of the text. + * end of the text. With a nondeterministic collation, we can't + * rely on the first bytes being equal, so we have to recurse in + * any case. */ if (*p == '\\') { @@ -172,7 +174,7 @@ MatchText(const char *t, int tlen, const char *p, int plen, pg_locale_t locale) while (tlen > 0) { - if (GETCHAR(*t, locale) == firstpat) + if (GETCHAR(*t, locale) == firstpat || (locale && !locale->deterministic)) { int matched = MatchText(t, tlen, p, plen, locale); @@ -196,6 +198,149 @@ MatchText(const char *t, int tlen, const char *p, int plen, pg_locale_t locale) NextByte(p, plen); continue; } + else if (locale && !locale->deterministic) + { + /* + * For nondeterministic locales, we find the next substring of the + * pattern that does not contain wildcards and try to find a + * matching substring in the text. Crucially, we cannot do this + * character by character, as in the normal case, but must do it + * substring by substring, partitioned by the wildcard characters. + * (This is per SQL standard.) + */ + const char *p1; + size_t p1len; + const char *t1; + size_t t1len; + bool found_escape; + const char *subpat; + size_t subpatlen; + char *buf = NULL; + + /* + * Determine next substring of pattern without wildcards. p is + * the start of the subpattern, p1 is one past the last byte. Also + * track if we found an escape character. + */ + p1 = p; + p1len = plen; + found_escape = false; + while (p1len > 0) + { + if (*p1 == '\\') + { + found_escape = true; + NextByte(p1, p1len); + if (p1len == 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE), + errmsg("LIKE pattern must not end with escape character"))); + } + else if (*p1 == '_' || *p1 == '%') + break; + NextByte(p1, p1len); + } + + /* + * If we found an escape character, then make an unescaped copy of + * the subpattern. + */ + if (found_escape) + { + char *b; + + b = buf = palloc(p1 - p); + for (const char *c = p; c < p1; c++) + { + if (*c == '\\') + ; + else + *(b++) = *c; + } + + subpat = buf; + subpatlen = b - buf; + } + else + { + subpat = p; + subpatlen = p1 - p; + } + + /* + * Shortcut: If this is the end of the pattern, then the rest of + * the text has to match the rest of the pattern. + */ + if (p1len == 0) + { + int cmp; + + cmp = pg_strncoll(subpat, subpatlen, t, tlen, locale); + + if (buf) + pfree(buf); + if (cmp == 0) + return LIKE_TRUE; + else + return LIKE_FALSE; + } + + /* + * Now build a substring of the text and try to match it against + * the subpattern. t is the start of the text, t1 is one past the + * last byte. We start with a zero-length string. + */ + t1 = t; + t1len = tlen; + for (;;) + { + int cmp; + + CHECK_FOR_INTERRUPTS(); + + cmp = pg_strncoll(subpat, subpatlen, t, (t1 - t), locale); + + /* + * If we found a match, we have to test if the rest of pattern + * can match against the rest of the string. Otherwise we + * have to continue here try matching with a longer substring. + * (This is similar to the recursion for the '%' wildcard + * above.) + * + * Note that we can't just wind forward p and t and continue + * with the main loop. This would fail for example with + * + * U&'\0061\0308bc' LIKE U&'\00E4_c' COLLATE ignore_accents + * + * You'd find that t=\0061 matches p=\00E4, but then the rest + * won't match; but t=\0061\0308 also matches p=\00E4, and + * then the rest will match. + */ + if (cmp == 0) + { + int matched = MatchText(t1, t1len, p1, p1len, locale); + + if (matched == LIKE_TRUE) + { + if (buf) + pfree(buf); + return matched; + } + } + + /* + * Didn't match. If we used up the whole text, then the match + * fails. Otherwise, try again with a longer substring. + */ + if (t1len == 0) + return LIKE_FALSE; + else + NextChar(t1, t1len); + } + if (buf) + pfree(buf); + continue; + } else if (GETCHAR(*p, locale) != GETCHAR(*t, locale)) { /* non-wildcard pattern char fails to match text char */ |