3 files changed, 176 insertions, 28 deletions
diff --git a/src/backend/utils/adt/like.c b/src/backend/utils/adt/like.c
index 0152723b2a6..7b3d1b5be71 100644
--- a/src/backend/utils/adt/like.c
+++ b/src/backend/utils/adt/like.c
@@ -147,22 +147,28 @@ SB_lower_char(unsigned char c, pg_locale_t locale)
 static inline int
 GenericMatchText(const char *s, int slen, const char *p, int plen, Oid collation)
 {
-	if (collation)
-	{
-		pg_locale_t locale = pg_newlocale_from_collation(collation);
+	pg_locale_t locale;
 
-		if (!locale->deterministic)
-			ereport(ERROR,
-					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-					 errmsg("nondeterministic collations are not supported for LIKE")));
+	if (!OidIsValid(collation))
+	{
+		/*
+		 * This typically means that the parser could not resolve a conflict
+		 * of implicit collations, so report it that way.
+		 */
+		ereport(ERROR,
+				(errcode(ERRCODE_INDETERMINATE_COLLATION),
+				 errmsg("could not determine which collation to use for LIKE"),
+				 errhint("Use the COLLATE clause to set the collation explicitly.")));
 	}
 
+	locale = pg_newlocale_from_collation(collation);
+
 	if (pg_database_encoding_max_length() == 1)
-		return SB_MatchText(s, slen, p, plen, 0);
+		return SB_MatchText(s, slen, p, plen, locale);
 	else if (GetDatabaseEncoding() == PG_UTF8)
-		return UTF8_MatchText(s, slen, p, plen, 0);
+		return UTF8_MatchText(s, slen, p, plen, locale);
 	else
-		return MB_MatchText(s, slen, p, plen, 0);
+		return MB_MatchText(s, slen, p, plen, locale);
 }
 
 static inline int
diff --git a/src/backend/utils/adt/like_match.c b/src/backend/utils/adt/like_match.c
index f561cc15e4c..afe5406cf40 100644
--- a/src/backend/utils/adt/like_match.c
+++ b/src/backend/utils/adt/like_match.c
@@ -157,7 +157,9 @@ MatchText(const char *t, int tlen, const char *p, int plen, pg_locale_t locale)
 			 * the first pattern byte to each text byte to avoid recursing
 			 * more than we have to.  This fact also guarantees that we don't
 			 * have to consider a match to the zero-length substring at the
-			 * end of the text.
+			 * end of the text.  With a nondeterministic collation, we can't
+			 * rely on the first bytes being equal, so we have to recurse in
+			 * any case.
 			 */
 			if (*p == '\\')
 			{
@@ -172,7 +174,7 @@ MatchText(const char *t, int tlen, const char *p, int plen, pg_locale_t locale)
 
 			while (tlen > 0)
 			{
-				if (GETCHAR(*t, locale) == firstpat)
+				if (GETCHAR(*t, locale) == firstpat || (locale && !locale->deterministic))
 				{
 					int			matched = MatchText(t, tlen, p, plen, locale);
 
@@ -196,6 +198,149 @@ MatchText(const char *t, int tlen, const char *p, int plen, pg_locale_t locale)
 			NextByte(p, plen);
 			continue;
 		}
+		else if (locale && !locale->deterministic)
+		{
+			/*
+			 * For nondeterministic locales, we find the next substring of the
+			 * pattern that does not contain wildcards and try to find a
+			 * matching substring in the text.  Crucially, we cannot do this
+			 * character by character, as in the normal case, but must do it
+			 * substring by substring, partitioned by the wildcard characters.
+			 * (This is per SQL standard.)
+			 */
+			const char *p1;
+			size_t		p1len;
+			const char *t1;
+			size_t		t1len;
+			bool		found_escape;
+			const char *subpat;
+			size_t		subpatlen;
+			char	   *buf = NULL;
+
+			/*
+			 * Determine next substring of pattern without wildcards.  p is
+			 * the start of the subpattern, p1 is one past the last byte. Also
+			 * track if we found an escape character.
+			 */
+			p1 = p;
+			p1len = plen;
+			found_escape = false;
+			while (p1len > 0)
+			{
+				if (*p1 == '\\')
+				{
+					found_escape = true;
+					NextByte(p1, p1len);
+					if (p1len == 0)
+						ereport(ERROR,
+								(errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
+								 errmsg("LIKE pattern must not end with escape character")));
+				}
+				else if (*p1 == '_' || *p1 == '%')
+					break;
+				NextByte(p1, p1len);
+			}
+
+			/*
+			 * If we found an escape character, then make an unescaped copy of
+			 * the subpattern.
+			 */
+			if (found_escape)
+			{
+				char	   *b;
+
+				b = buf = palloc(p1 - p);
+				for (const char *c = p; c < p1; c++)
+				{
+					/* keep the escaped byte, even if it is a backslash */
+					if (*c == '\\')
+						c++;	/* scan above ensures a byte follows */
+					*(b++) = *c;
+				}
+
+				subpat = buf;
+				subpatlen = b - buf;
+			}
+			else
+			{
+				subpat = p;
+				subpatlen = p1 - p;
+			}
+
+			/*
+			 * Shortcut: If this is the end of the pattern, then the rest of
+			 * the text has to match the rest of the pattern.
+			 */
+			if (p1len == 0)
+			{
+				int			cmp;
+
+				cmp = pg_strncoll(subpat, subpatlen, t, tlen, locale);
+
+				if (buf)
+					pfree(buf);
+				if (cmp == 0)
+					return LIKE_TRUE;
+				else
+					return LIKE_FALSE;
+			}
+
+			/*
+			 * Now build a substring of the text and try to match it against
+			 * the subpattern.  t is the start of the text, t1 is one past the
+			 * last byte.  We start with a zero-length string.
+			 */
+			t1 = t;
+			t1len = tlen;
+			for (;;)
+			{
+				int			cmp;
+
+				CHECK_FOR_INTERRUPTS();
+
+				cmp = pg_strncoll(subpat, subpatlen, t, (t1 - t), locale);
+
+				/*
+				 * If we found a match, we have to test if the rest of the
+				 * pattern can match against the rest of the string.
+				 * Otherwise we have to continue here and try matching with a
+				 * longer substring.  (This is similar to the recursion for
+				 * the '%' wildcard above.)
+				 *
+				 * Note that we can't just wind forward p and t and continue
+				 * with the main loop.  This would fail for example with
+				 *
+				 * U&'\0061\0308bc' LIKE U&'\00E4_c' COLLATE ignore_accents
+				 *
+				 * You'd find that t=\0061 matches p=\00E4, but then the rest
+				 * won't match; but t=\0061\0308 also matches p=\00E4, and
+				 * then the rest will match.
+				 */
+				if (cmp == 0)
+				{
+					int			matched = MatchText(t1, t1len, p1, p1len, locale);
+
+					if (matched == LIKE_TRUE)
+					{
+						if (buf)
+							pfree(buf);
+						return matched;
+					}
+				}
+
+				/*
+				 * Didn't match.  If we used up the whole text, then the match
+				 * fails; free buf first.  Otherwise, try a longer substring.
+				 */
+				if (t1len == 0)
+				{
+					if (buf)
+						pfree(buf);
+					return LIKE_FALSE;
+				}
+				NextChar(t1, t1len);
+			}
+		}
 		else if (GETCHAR(*p, locale) != GETCHAR(*t, locale))
 		{
 			/* non-wildcard pattern char fails to match text char */
diff --git a/src/backend/utils/adt/like_support.c b/src/backend/utils/adt/like_support.c
index 8b15509a3bf..ee71ca89ffd 100644
--- a/src/backend/utils/adt/like_support.c
+++ b/src/backend/utils/adt/like_support.c
@@ -273,22 +273,6 @@ match_pattern_prefix(Node *leftop,
 	patt = (Const *) rightop;
 
 	/*
-	 * Not supported if the expression collation is nondeterministic.  The
-	 * optimized equality or prefix tests use bytewise comparisons, which is
-	 * not consistent with nondeterministic collations.  The actual
-	 * pattern-matching implementation functions will later error out that
-	 * pattern-matching is not supported with nondeterministic collations. (We
-	 * could also error out here, but by doing it later we get more precise
-	 * error messages.)  (It should be possible to support at least
-	 * Pattern_Prefix_Exact, but no point as long as the actual
-	 * pattern-matching implementations don't support it.)
-	 *
-	 * expr_coll is not set for a non-collation-aware data type such as bytea.
-	 */
-	if (expr_coll && !get_collation_isdeterministic(expr_coll))
-		return NIL;
-
-	/*
 	 * Try to extract a fixed prefix from the pattern.
 	 */
 	pstatus = pattern_fixed_prefix(patt, ptype, expr_coll,
@@ -404,6 +388,8 @@ match_pattern_prefix(Node *leftop,
 	{
 		if (!op_in_opfamily(eqopr, opfamily))
 			return NIL;
+		if (indexcollation != expr_coll)
+			return NIL;
 		expr = make_opclause(eqopr, BOOLOID, false,
 							 (Expr *) leftop, (Expr *) prefix,
 							 InvalidOid, indexcollation);
@@ -412,6 +398,17 @@ match_pattern_prefix(Node *leftop,
 	}
 
 	/*
+	 * Anything other than Pattern_Prefix_Exact is not supported if the
+	 * expression collation is nondeterministic.  The optimized equality or
+	 * prefix tests use bytewise comparisons, which is not consistent with
+	 * nondeterministic collations.
+	 *
+	 * expr_coll is not set for a non-collation-aware data type such as bytea.
+	 */
+	if (expr_coll && !get_collation_isdeterministic(expr_coll))
+		return NIL;
+
+	/*
 	 * Otherwise, we have a nonempty required prefix of the values.  Some
 	 * opclasses support prefix checks directly, otherwise we'll try to
 	 * generate a range constraint.