diff options
Diffstat (limited to 'src/backend/utils')
-rw-r--r-- | src/backend/utils/adt/like.c | 26 | ||||
-rw-r--r-- | src/backend/utils/adt/like_match.c | 149 | ||||
-rw-r--r-- | src/backend/utils/adt/like_support.c | 29 |
3 files changed, 176 insertions, 28 deletions
diff --git a/src/backend/utils/adt/like.c b/src/backend/utils/adt/like.c index 0152723b2a6..7b3d1b5be71 100644 --- a/src/backend/utils/adt/like.c +++ b/src/backend/utils/adt/like.c @@ -147,22 +147,28 @@ SB_lower_char(unsigned char c, pg_locale_t locale) static inline int GenericMatchText(const char *s, int slen, const char *p, int plen, Oid collation) { - if (collation) - { - pg_locale_t locale = pg_newlocale_from_collation(collation); + pg_locale_t locale; - if (!locale->deterministic) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("nondeterministic collations are not supported for LIKE"))); + if (!OidIsValid(collation)) + { + /* + * This typically means that the parser could not resolve a conflict + * of implicit collations, so report it that way. + */ + ereport(ERROR, + (errcode(ERRCODE_INDETERMINATE_COLLATION), + errmsg("could not determine which collation to use for LIKE"), + errhint("Use the COLLATE clause to set the collation explicitly."))); } + locale = pg_newlocale_from_collation(collation); + if (pg_database_encoding_max_length() == 1) - return SB_MatchText(s, slen, p, plen, 0); + return SB_MatchText(s, slen, p, plen, locale); else if (GetDatabaseEncoding() == PG_UTF8) - return UTF8_MatchText(s, slen, p, plen, 0); + return UTF8_MatchText(s, slen, p, plen, locale); else - return MB_MatchText(s, slen, p, plen, 0); + return MB_MatchText(s, slen, p, plen, locale); } static inline int diff --git a/src/backend/utils/adt/like_match.c b/src/backend/utils/adt/like_match.c index f561cc15e4c..afe5406cf40 100644 --- a/src/backend/utils/adt/like_match.c +++ b/src/backend/utils/adt/like_match.c @@ -157,7 +157,9 @@ MatchText(const char *t, int tlen, const char *p, int plen, pg_locale_t locale) * the first pattern byte to each text byte to avoid recursing * more than we have to. This fact also guarantees that we don't * have to consider a match to the zero-length substring at the - * end of the text. + * end of the text. 
With a nondeterministic collation, we can't + * rely on the first bytes being equal, so we have to recurse in + * any case. */ if (*p == '\\') { @@ -172,7 +174,7 @@ MatchText(const char *t, int tlen, const char *p, int plen, pg_locale_t locale) while (tlen > 0) { - if (GETCHAR(*t, locale) == firstpat) + if (GETCHAR(*t, locale) == firstpat || (locale && !locale->deterministic)) { int matched = MatchText(t, tlen, p, plen, locale); @@ -196,6 +198,149 @@ MatchText(const char *t, int tlen, const char *p, int plen, pg_locale_t locale) NextByte(p, plen); continue; } + else if (locale && !locale->deterministic) + { + /* + * For nondeterministic locales, we find the next substring of the + * pattern that does not contain wildcards and try to find a + * matching substring in the text. Crucially, we cannot do this + * character by character, as in the normal case, but must do it + * substring by substring, partitioned by the wildcard characters. + * (This is per SQL standard.) + */ + const char *p1; + size_t p1len; + const char *t1; + size_t t1len; + bool found_escape; + const char *subpat; + size_t subpatlen; + char *buf = NULL; + + /* + * Determine next substring of pattern without wildcards. p is + * the start of the subpattern, p1 is one past the last byte. Also + * track if we found an escape character. + */ + p1 = p; + p1len = plen; + found_escape = false; + while (p1len > 0) + { + if (*p1 == '\\') + { + found_escape = true; + NextByte(p1, p1len); + if (p1len == 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE), + errmsg("LIKE pattern must not end with escape character"))); + } + else if (*p1 == '_' || *p1 == '%') + break; + NextByte(p1, p1len); + } + + /* + * If we found an escape character, then make an unescaped copy of + * the subpattern. 
+ */ + if (found_escape) + { + char *b; + + b = buf = palloc(p1 - p); + for (const char *c = p; c < p1; c++) + { + if (*c == '\\') + ; + else + *(b++) = *c; + } + + subpat = buf; + subpatlen = b - buf; + } + else + { + subpat = p; + subpatlen = p1 - p; + } + + /* + * Shortcut: If this is the end of the pattern, then the rest of + * the text has to match the rest of the pattern. + */ + if (p1len == 0) + { + int cmp; + + cmp = pg_strncoll(subpat, subpatlen, t, tlen, locale); + + if (buf) + pfree(buf); + if (cmp == 0) + return LIKE_TRUE; + else + return LIKE_FALSE; + } + + /* + * Now build a substring of the text and try to match it against + * the subpattern. t is the start of the text, t1 is one past the + * last byte. We start with a zero-length string. + */ + t1 = t; + t1len = tlen; + for (;;) + { + int cmp; + + CHECK_FOR_INTERRUPTS(); + + cmp = pg_strncoll(subpat, subpatlen, t, (t1 - t), locale); + + /* + * If we found a match, we have to test if the rest of the pattern + * can match against the rest of the string. Otherwise we + * have to continue here and try matching with a longer substring. + * (This is similar to the recursion for the '%' wildcard + * above.) + * + * Note that we can't just wind forward p and t and continue + * with the main loop. This would fail for example with + * + * U&'\0061\0308bc' LIKE U&'\00E4_c' COLLATE ignore_accents + * + * You'd find that t=\0061 matches p=\00E4, but then the rest + * won't match; but t=\0061\0308 also matches p=\00E4, and + * then the rest will match. + */ + if (cmp == 0) + { + int matched = MatchText(t1, t1len, p1, p1len, locale); + + if (matched == LIKE_TRUE) + { + if (buf) + pfree(buf); + return matched; + } + } + + /* + * Didn't match. If we used up the whole text, then the match + * fails. Otherwise, try again with a longer substring. 
+ */ + if (t1len == 0) + return LIKE_FALSE; + else + NextChar(t1, t1len); + } + if (buf) + pfree(buf); + continue; + } else if (GETCHAR(*p, locale) != GETCHAR(*t, locale)) { /* non-wildcard pattern char fails to match text char */ diff --git a/src/backend/utils/adt/like_support.c b/src/backend/utils/adt/like_support.c index 8b15509a3bf..ee71ca89ffd 100644 --- a/src/backend/utils/adt/like_support.c +++ b/src/backend/utils/adt/like_support.c @@ -273,22 +273,6 @@ match_pattern_prefix(Node *leftop, patt = (Const *) rightop; /* - * Not supported if the expression collation is nondeterministic. The - * optimized equality or prefix tests use bytewise comparisons, which is - * not consistent with nondeterministic collations. The actual - * pattern-matching implementation functions will later error out that - * pattern-matching is not supported with nondeterministic collations. (We - * could also error out here, but by doing it later we get more precise - * error messages.) (It should be possible to support at least - * Pattern_Prefix_Exact, but no point as long as the actual - * pattern-matching implementations don't support it.) - * - * expr_coll is not set for a non-collation-aware data type such as bytea. - */ - if (expr_coll && !get_collation_isdeterministic(expr_coll)) - return NIL; - - /* * Try to extract a fixed prefix from the pattern. */ pstatus = pattern_fixed_prefix(patt, ptype, expr_coll, @@ -404,6 +388,8 @@ match_pattern_prefix(Node *leftop, { if (!op_in_opfamily(eqopr, opfamily)) return NIL; + if (indexcollation != expr_coll) + return NIL; expr = make_opclause(eqopr, BOOLOID, false, (Expr *) leftop, (Expr *) prefix, InvalidOid, indexcollation); @@ -412,6 +398,17 @@ match_pattern_prefix(Node *leftop, } /* + * Anything other than Pattern_Prefix_Exact is not supported if the + * expression collation is nondeterministic. The optimized equality or + * prefix tests use bytewise comparisons, which is not consistent with + * nondeterministic collations. 
+ * + * expr_coll is not set for a non-collation-aware data type such as bytea. + */ + if (expr_coll && !get_collation_isdeterministic(expr_coll)) + return NIL; + + /* * Otherwise, we have a nonempty required prefix of the values. Some * opclasses support prefix checks directly, otherwise we'll try to * generate a range constraint. |