1 files changed, 147 insertions, 2 deletions
diff --git a/src/backend/utils/adt/like_match.c b/src/backend/utils/adt/like_match.c
index f561cc15e4c..afe5406cf40 100644
--- a/src/backend/utils/adt/like_match.c
+++ b/src/backend/utils/adt/like_match.c
@@ -157,7 +157,9 @@ MatchText(const char *t, int tlen, const char *p, int plen, pg_locale_t locale)
 			 * the first pattern byte to each text byte to avoid recursing
 			 * more than we have to.  This fact also guarantees that we don't
 			 * have to consider a match to the zero-length substring at the
-			 * end of the text.
+			 * end of the text.  With a nondeterministic collation, we can't
+			 * rely on the first bytes being equal, so we have to recurse in
+			 * any case.
 			 */
 			if (*p == '\\')
 			{
@@ -172,7 +174,7 @@ MatchText(const char *t, int tlen, const char *p, int plen, pg_locale_t locale)
 
 			while (tlen > 0)
 			{
-				if (GETCHAR(*t, locale) == firstpat)
+				if (GETCHAR(*t, locale) == firstpat || (locale && !locale->deterministic))
 				{
 					int			matched = MatchText(t, tlen, p, plen, locale);
 
@@ -196,6 +198,149 @@ MatchText(const char *t, int tlen, const char *p, int plen, pg_locale_t locale)
 			NextByte(p, plen);
 			continue;
 		}
+		else if (locale && !locale->deterministic)
+		{
+			/*
+			 * For nondeterministic locales, we find the next substring of the
+			 * pattern that does not contain wildcards and try to find a
+			 * matching substring in the text.  Crucially, we cannot do this
+			 * character by character, as in the normal case, but must do it
+			 * substring by substring, partitioned by the wildcard characters.
+			 * (This is per SQL standard.)
+			 */
+			const char *p1;
+			size_t		p1len;
+			const char *t1;
+			size_t		t1len;
+			bool		found_escape;
+			const char *subpat;
+			size_t		subpatlen;
+			char	   *buf = NULL;
+
+			/*
+			 * Determine next substring of pattern without wildcards.  p is
+			 * the start of the subpattern, p1 is one past the last byte. Also
+			 * track if we found an escape character.
+			 */
+			p1 = p;
+			p1len = plen;
+			found_escape = false;
+			while (p1len > 0)
+			{
+				if (*p1 == '\\')
+				{
+					found_escape = true;
+					NextByte(p1, p1len);
+					if (p1len == 0)
+						ereport(ERROR,
+								(errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
+								 errmsg("LIKE pattern must not end with escape character")));
+				}
+				else if (*p1 == '_' || *p1 == '%')
+					break;
+				NextByte(p1, p1len);
+			}
+
+			/*
+			 * If we found an escape character, then make an unescaped copy of
+			 * the subpattern.
+			 */
+			if (found_escape)
+			{
+				char	   *b;
+
+				b = buf = palloc(p1 - p);
+				for (const char *c = p; c < p1; c++)
+				{
+					if (*c == '\\')
+						;
+					else
+						*(b++) = *c;
+				}
+
+				subpat = buf;
+				subpatlen = b - buf;
+			}
+			else
+			{
+				subpat = p;
+				subpatlen = p1 - p;
+			}
+
+			/*
+			 * Shortcut: If this is the end of the pattern, then the rest of
+			 * the text has to match the rest of the pattern.
+			 */
+			if (p1len == 0)
+			{
+				int			cmp;
+
+				cmp = pg_strncoll(subpat, subpatlen, t, tlen, locale);
+
+				if (buf)
+					pfree(buf);
+				if (cmp == 0)
+					return LIKE_TRUE;
+				else
+					return LIKE_FALSE;
+			}
+
+			/*
+			 * Now build a substring of the text and try to match it against
+			 * the subpattern.  t is the start of the text, t1 is one past the
+			 * last byte.  We start with a zero-length string.
+			 */
+			t1 = t;
+			t1len = tlen;
+			for (;;)
+			{
+				int			cmp;
+
+				CHECK_FOR_INTERRUPTS();
+
+				cmp = pg_strncoll(subpat, subpatlen, t, (t1 - t), locale);
+
+				/*
+				 * If we found a match, we have to test if the rest of pattern
+				 * can match against the rest of the string.  Otherwise we
+				 * have to continue here try matching with a longer substring.
+				 * (This is similar to the recursion for the '%' wildcard
+				 * above.)
+				 *
+				 * Note that we can't just wind forward p and t and continue
+				 * with the main loop.  This would fail for example with
+				 *
+				 * U&'\0061\0308bc' LIKE U&'\00E4_c' COLLATE ignore_accents
+				 *
+				 * You'd find that t=\0061 matches p=\00E4, but then the rest
+				 * won't match; but t=\0061\0308 also matches p=\00E4, and
+				 * then the rest will match.
+				 */
+				if (cmp == 0)
+				{
+					int			matched = MatchText(t1, t1len, p1, p1len, locale);
+
+					if (matched == LIKE_TRUE)
+					{
+						if (buf)
+							pfree(buf);
+						return matched;
+					}
+				}
+
+				/*
+				 * Didn't match.  If we used up the whole text, then the match
+				 * fails.  Otherwise, try again with a longer substring.
+				 */
+				if (t1len == 0)
+					return LIKE_FALSE;
+				else
+					NextChar(t1, t1len);
+			}
+			if (buf)
+				pfree(buf);
+			continue;
+		}
 		else if (GETCHAR(*p, locale) != GETCHAR(*t, locale))
 		{
 			/* non-wildcard pattern char fails to match text char */