Diffstat (limited to 'src')
-rw-r--r--	src/backend/snowball/dict_snowball.c	18
1 file changed, 17 insertions(+), 1 deletion(-)
diff --git a/src/backend/snowball/dict_snowball.c b/src/backend/snowball/dict_snowball.c
index 68c9213f691..1d5dfff5a02 100644
--- a/src/backend/snowball/dict_snowball.c
+++ b/src/backend/snowball/dict_snowball.c
@@ -275,8 +275,24 @@ dsnowball_lexize(PG_FUNCTION_ARGS)
 	char	   *txt = lowerstr_with_len(in, len);
 	TSLexeme   *res = palloc0(sizeof(TSLexeme) * 2);
 
-	if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
+	/*
+	 * Do not pass strings exceeding 1000 bytes to the stemmer, as they're
+	 * surely not words in any human language. This restriction avoids
+	 * wasting cycles on stuff like base64-encoded data, and it protects us
+	 * against possible inefficiency or misbehavior in the stemmer. (For
+	 * example, the Turkish stemmer has an indefinite recursion, so it can
+	 * crash on long-enough strings.) However, Snowball dictionaries are
+	 * defined to recognize all strings, so we can't reject the string as an
+	 * unknown word.
+	 */
+	if (len > 1000)
+	{
+		/* return the lexeme lowercased, but otherwise unmodified */
+		res->lexeme = txt;
+	}
+	else if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
 	{
+		/* empty or stopword, so report as stopword */
 		pfree(txt);
 	}
 	else
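
The patched branch simply hands an over-long token back lowercased instead of running it through the stemmer. Below is a minimal, self-contained sketch of that control flow outside PostgreSQL; toy_lowercase(), toy_stem(), and lexize() are hypothetical stand-ins (roughly for lowerstr_with_len(), the Snowball stemmer, and dsnowball_lexize()), the lowercasing is ASCII-only for brevity, and the stopword check is omitted.

/*
 * Illustrative sketch only, not PostgreSQL code: the same length-guard
 * pattern applied before a (stand-in) stemmer.
 */
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_STEM_INPUT 1000		/* same limit the patch applies */

/* hypothetical helper: return a lowercased copy of the input (ASCII only) */
static char *
toy_lowercase(const char *in, size_t len)
{
	char	   *out = malloc(len + 1);

	for (size_t i = 0; i < len; i++)
		out[i] = (char) tolower((unsigned char) in[i]);
	out[len] = '\0';
	return out;
}

/* hypothetical stand-in for the Snowball stemmer: strip a trailing "s" */
static void
toy_stem(char *word)
{
	size_t		n = strlen(word);

	if (n > 1 && word[n - 1] == 's')
		word[n - 1] = '\0';
}

/* mirrors the patched control flow: lowercase always, stem only short tokens */
static char *
lexize(const char *in, size_t len)
{
	char	   *txt = toy_lowercase(in, len);

	if (len > MAX_STEM_INPUT)
	{
		/* too long to be a real word: return it lowercased, unstemmed */
		return txt;
	}
	toy_stem(txt);
	return txt;
}

int
main(void)
{
	char		long_token[2001];

	memset(long_token, 'A', 2000);
	long_token[2000] = '\0';

	printf("%s\n", lexize("Horses", 6));			/* stemmed: "horse" */
	printf("%.10s...\n", lexize(long_token, 2000));	/* lowercased only */
	return 0;
}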