aboutsummaryrefslogtreecommitdiff
path: root/src/backend/utils
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/utils')
-rw-r--r--src/backend/utils/adt/like.c26
-rw-r--r--src/backend/utils/adt/like_match.c149
-rw-r--r--src/backend/utils/adt/like_support.c29
3 files changed, 176 insertions, 28 deletions
diff --git a/src/backend/utils/adt/like.c b/src/backend/utils/adt/like.c
index 0152723b2a6..7b3d1b5be71 100644
--- a/src/backend/utils/adt/like.c
+++ b/src/backend/utils/adt/like.c
@@ -147,22 +147,28 @@ SB_lower_char(unsigned char c, pg_locale_t locale)
static inline int
GenericMatchText(const char *s, int slen, const char *p, int plen, Oid collation)
{
- if (collation)
- {
- pg_locale_t locale = pg_newlocale_from_collation(collation);
+ pg_locale_t locale;
- if (!locale->deterministic)
- ereport(ERROR,
- (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
- errmsg("nondeterministic collations are not supported for LIKE")));
+ if (!OidIsValid(collation))
+ {
+ /*
+ * This typically means that the parser could not resolve a conflict
+ * of implicit collations, so report it that way.
+ */
+ ereport(ERROR,
+ (errcode(ERRCODE_INDETERMINATE_COLLATION),
+ errmsg("could not determine which collation to use for LIKE"),
+ errhint("Use the COLLATE clause to set the collation explicitly.")));
}
+ locale = pg_newlocale_from_collation(collation);
+
if (pg_database_encoding_max_length() == 1)
- return SB_MatchText(s, slen, p, plen, 0);
+ return SB_MatchText(s, slen, p, plen, locale);
else if (GetDatabaseEncoding() == PG_UTF8)
- return UTF8_MatchText(s, slen, p, plen, 0);
+ return UTF8_MatchText(s, slen, p, plen, locale);
else
- return MB_MatchText(s, slen, p, plen, 0);
+ return MB_MatchText(s, slen, p, plen, locale);
}
static inline int
diff --git a/src/backend/utils/adt/like_match.c b/src/backend/utils/adt/like_match.c
index f561cc15e4c..afe5406cf40 100644
--- a/src/backend/utils/adt/like_match.c
+++ b/src/backend/utils/adt/like_match.c
@@ -157,7 +157,9 @@ MatchText(const char *t, int tlen, const char *p, int plen, pg_locale_t locale)
* the first pattern byte to each text byte to avoid recursing
* more than we have to. This fact also guarantees that we don't
* have to consider a match to the zero-length substring at the
- * end of the text.
+ * end of the text. With a nondeterministic collation, we can't
+ * rely on the first bytes being equal, so we have to recurse in
+ * any case.
*/
if (*p == '\\')
{
@@ -172,7 +174,7 @@ MatchText(const char *t, int tlen, const char *p, int plen, pg_locale_t locale)
while (tlen > 0)
{
- if (GETCHAR(*t, locale) == firstpat)
+ if (GETCHAR(*t, locale) == firstpat || (locale && !locale->deterministic))
{
int matched = MatchText(t, tlen, p, plen, locale);
@@ -196,6 +198,149 @@ MatchText(const char *t, int tlen, const char *p, int plen, pg_locale_t locale)
NextByte(p, plen);
continue;
}
+ else if (locale && !locale->deterministic)
+ {
+ /*
+ * For nondeterministic locales, we find the next substring of the
+ * pattern that does not contain wildcards and try to find a
+ * matching substring in the text. Crucially, we cannot do this
+ * character by character, as in the normal case, but must do it
+ * substring by substring, partitioned by the wildcard characters.
+ * (This is per SQL standard.)
+ */
+ const char *p1;
+ size_t p1len;
+ const char *t1;
+ size_t t1len;
+ bool found_escape;
+ const char *subpat;
+ size_t subpatlen;
+ char *buf = NULL;
+
+ /*
+ * Determine next substring of pattern without wildcards. p is
+ * the start of the subpattern, p1 is one past the last byte. Also
+ * track if we found an escape character.
+ */
+ p1 = p;
+ p1len = plen;
+ found_escape = false;
+ while (p1len > 0)
+ {
+ if (*p1 == '\\')
+ {
+ found_escape = true;
+ NextByte(p1, p1len);
+ if (p1len == 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
+ errmsg("LIKE pattern must not end with escape character")));
+ }
+ else if (*p1 == '_' || *p1 == '%')
+ break;
+ NextByte(p1, p1len);
+ }
+
+ /*
+ * If we found an escape character, then make an unescaped copy of
+ * the subpattern.
+ */
+ if (found_escape)
+ {
+ char *b;
+
+ b = buf = palloc(p1 - p);
+ for (const char *c = p; c < p1; c++)
+ {
+ if (*c == '\\')
+ ;
+ else
+ *(b++) = *c;
+ }
+
+ subpat = buf;
+ subpatlen = b - buf;
+ }
+ else
+ {
+ subpat = p;
+ subpatlen = p1 - p;
+ }
+
+ /*
+ * Shortcut: If this is the end of the pattern, then the rest of
+ * the text has to match the rest of the pattern.
+ */
+ if (p1len == 0)
+ {
+ int cmp;
+
+ cmp = pg_strncoll(subpat, subpatlen, t, tlen, locale);
+
+ if (buf)
+ pfree(buf);
+ if (cmp == 0)
+ return LIKE_TRUE;
+ else
+ return LIKE_FALSE;
+ }
+
+ /*
+ * Now build a substring of the text and try to match it against
+ * the subpattern. t is the start of the text, t1 is one past the
+ * last byte. We start with a zero-length string.
+ */
+ t1 = t;
+ t1len = tlen;
+ for (;;)
+ {
+ int cmp;
+
+ CHECK_FOR_INTERRUPTS();
+
+ cmp = pg_strncoll(subpat, subpatlen, t, (t1 - t), locale);
+
+ /*
+ * If we found a match, we have to test if the rest of the pattern
+ * can match against the rest of the string. Otherwise we
+ * have to continue here and try matching with a longer substring.
+ * (This is similar to the recursion for the '%' wildcard
+ * above.)
+ *
+ * Note that we can't just wind forward p and t and continue
+ * with the main loop. This would fail for example with
+ *
+ * U&'\0061\0308bc' LIKE U&'\00E4_c' COLLATE ignore_accents
+ *
+ * You'd find that t=\0061 matches p=\00E4, but then the rest
+ * won't match; but t=\0061\0308 also matches p=\00E4, and
+ * then the rest will match.
+ */
+ if (cmp == 0)
+ {
+ int matched = MatchText(t1, t1len, p1, p1len, locale);
+
+ if (matched == LIKE_TRUE)
+ {
+ if (buf)
+ pfree(buf);
+ return matched;
+ }
+ }
+
+ /*
+ * Didn't match. If we used up the whole text, then the match
+ * fails. Otherwise, try again with a longer substring.
+ */
+ if (t1len == 0)
+ return LIKE_FALSE;
+ else
+ NextChar(t1, t1len);
+ }
+ if (buf)
+ pfree(buf);
+ continue;
+ }
else if (GETCHAR(*p, locale) != GETCHAR(*t, locale))
{
/* non-wildcard pattern char fails to match text char */
diff --git a/src/backend/utils/adt/like_support.c b/src/backend/utils/adt/like_support.c
index 8b15509a3bf..ee71ca89ffd 100644
--- a/src/backend/utils/adt/like_support.c
+++ b/src/backend/utils/adt/like_support.c
@@ -273,22 +273,6 @@ match_pattern_prefix(Node *leftop,
patt = (Const *) rightop;
/*
- * Not supported if the expression collation is nondeterministic. The
- * optimized equality or prefix tests use bytewise comparisons, which is
- * not consistent with nondeterministic collations. The actual
- * pattern-matching implementation functions will later error out that
- * pattern-matching is not supported with nondeterministic collations. (We
- * could also error out here, but by doing it later we get more precise
- * error messages.) (It should be possible to support at least
- * Pattern_Prefix_Exact, but no point as long as the actual
- * pattern-matching implementations don't support it.)
- *
- * expr_coll is not set for a non-collation-aware data type such as bytea.
- */
- if (expr_coll && !get_collation_isdeterministic(expr_coll))
- return NIL;
-
- /*
* Try to extract a fixed prefix from the pattern.
*/
pstatus = pattern_fixed_prefix(patt, ptype, expr_coll,
@@ -404,6 +388,8 @@ match_pattern_prefix(Node *leftop,
{
if (!op_in_opfamily(eqopr, opfamily))
return NIL;
+ if (indexcollation != expr_coll)
+ return NIL;
expr = make_opclause(eqopr, BOOLOID, false,
(Expr *) leftop, (Expr *) prefix,
InvalidOid, indexcollation);
@@ -412,6 +398,17 @@ match_pattern_prefix(Node *leftop,
}
/*
+ * Anything other than Pattern_Prefix_Exact is not supported if the
+ * expression collation is nondeterministic. The optimized equality or
+ * prefix tests use bytewise comparisons, which is not consistent with
+ * nondeterministic collations.
+ *
+ * expr_coll is not set for a non-collation-aware data type such as bytea.
+ */
+ if (expr_coll && !get_collation_isdeterministic(expr_coll))
+ return NIL;
+
+ /*
* Otherwise, we have a nonempty required prefix of the values. Some
* opclasses support prefix checks directly, otherwise we'll try to
* generate a range constraint.