Diffstat (limited to 'src')
-rw-r--r--	src/backend/snowball/dict_snowball.c	18
1 file changed, 17 insertions(+), 1 deletion(-)
diff --git a/src/backend/snowball/dict_snowball.c b/src/backend/snowball/dict_snowball.c
index 68c9213f691..1d5dfff5a02 100644
--- a/src/backend/snowball/dict_snowball.c
+++ b/src/backend/snowball/dict_snowball.c
@@ -275,8 +275,24 @@ dsnowball_lexize(PG_FUNCTION_ARGS)
 	char	   *txt = lowerstr_with_len(in, len);
 	TSLexeme   *res = palloc0(sizeof(TSLexeme) * 2);
 
-	if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
+	/*
+	 * Do not pass strings exceeding 1000 bytes to the stemmer, as they're
+	 * surely not words in any human language. This restriction avoids
+	 * wasting cycles on stuff like base64-encoded data, and it protects us
+	 * against possible inefficiency or misbehavior in the stemmer. (For
+	 * example, the Turkish stemmer has an indefinite recursion, so it can
+	 * crash on long-enough strings.) However, Snowball dictionaries are
+	 * defined to recognize all strings, so we can't reject the string as an
+	 * unknown word.
+	 */
+	if (len > 1000)
+	{
+		/* return the lexeme lowercased, but otherwise unmodified */
+		res->lexeme = txt;
+	}
+	else if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
 	{
+		/* empty or stopword, so report as stopword */
 		pfree(txt);
 	}
 	else
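
The patched branch simply hands an over-long token back lowercased instead of running it through the stemmer. Below is a minimal, self-contained sketch of that control flow outside PostgreSQL; toy_lowercase(), toy_stem(), and lexize() are hypothetical stand-ins (roughly for lowerstr_with_len(), the Snowball stemmer, and dsnowball_lexize()), the lowercasing is ASCII-only for brevity, and the stopword check is omitted.

/*
 * Illustrative sketch only, not PostgreSQL code: the same length-guard
 * pattern applied before a (stand-in) stemmer.
 */
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_STEM_INPUT 1000		/* same limit the patch applies */

/* hypothetical helper: return a lowercased copy of the input (ASCII only) */
static char *
toy_lowercase(const char *in, size_t len)
{
	char	   *out = malloc(len + 1);

	for (size_t i = 0; i < len; i++)
		out[i] = (char) tolower((unsigned char) in[i]);
	out[len] = '\0';
	return out;
}

/* hypothetical stand-in for the Snowball stemmer: strip a trailing "s" */
static void
toy_stem(char *word)
{
	size_t		n = strlen(word);

	if (n > 1 && word[n - 1] == 's')
		word[n - 1] = '\0';
}

/* mirrors the patched control flow: lowercase always, stem only short tokens */
static char *
lexize(const char *in, size_t len)
{
	char	   *txt = toy_lowercase(in, len);

	if (len > MAX_STEM_INPUT)
	{
		/* too long to be a real word: return it lowercased, unstemmed */
		return txt;
	}
	toy_stem(txt);
	return txt;
}

int
main(void)
{
	char		long_token[2001];

	memset(long_token, 'A', 2000);
	long_token[2000] = '\0';

	printf("%s\n", lexize("Horses", 6));			/* stemmed: "horse" */
	printf("%.10s...\n", lexize(long_token, 2000));	/* lowercased only */
	return 0;
}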