Cope with more than 64K phrases in a thesaurus dictionary.

dict_thesaurus stored phrase IDs in uint16 fields, so it would get confused and even crash if there were more than 64K entries in the configuration file. It turns out to be basically free to widen the phrase IDs to uint32, so let's just do so.

This was complained of some time ago by David Boutin (in bug #7793); he later submitted an informal patch but it was never acted on. We now have another complaint (bug #11901 from Luc Ouellette) so it's time to make something happen.

This is basically Boutin's patch, but for future-proofing I also added a defense against too many words per phrase. Note that we don't need any explicit defense against overflow of the uint32 counters, since before that happens we'd hit array allocation sizes that repalloc rejects.

Back-patch to all supported branches because of the crash risk.
author: Tom Lane <tgl@sss.pgh.pa.us> 2014-11-06 20:52:40 -0500
committer: Tom Lane <tgl@sss.pgh.pa.us> 2014-11-06 20:52:40 -0500
commit: d6e37b35cda9a88dfd938dd61e9986dd93cc6dd3 (patch)
tree: ddbc928427854eaedaf6fb8efed5f9c6144e701d
parent: 4875931938b27924fe8d6f91bbdb09e2e5a29d0a (diff)
download: postgresql-d6e37b35cda9a88dfd938dd61e9986dd93cc6dd3.tar.gz
postgresql-d6e37b35cda9a88dfd938dd61e9986dd93cc6dd3.zip
1 file changed, 17 insertions, 8 deletions
diff --git a/src/backend/tsearch/dict_thesaurus.c b/src/backend/tsearch/dict_thesaurus.c
index fe4b8f41918..bb8132e98d7 100644
--- a/src/backend/tsearch/dict_thesaurus.c
+++ b/src/backend/tsearch/dict_thesaurus.c
@@ -28,7 +28,7 @@
 
 typedef struct LexemeInfo
 {
-	uint16		idsubst;		/* entry's number in DictThesaurus->subst */
+	uint32		idsubst;		/* entry's number in DictThesaurus->subst */
 	uint16		posinsubst;		/* pos info in entry */
 	uint16		tnvariant;		/* total num lexemes in one variant */
 	struct LexemeInfo *nextentry;
@@ -68,7 +68,7 @@ typedef struct
 
 
 static void
-newLexeme(DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 posinsubst)
+newLexeme(DictThesaurus *d, char *b, char *e, uint32 idsubst, uint16 posinsubst)
 {
 	TheLexeme  *ptr;
 
@@ -102,7 +102,7 @@ newLexeme(DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 posinsubst)
 }
 
 static void
-addWrd(DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 nwrd, uint16 posinsubst, bool useasis)
+addWrd(DictThesaurus *d, char *b, char *e, uint32 idsubst, uint16 nwrd, uint16 posinsubst, bool useasis)
 {
 	static int	nres = 0;
 	static int	ntres = 0;
@@ -143,7 +143,6 @@ addWrd(DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 nwrd, uint16 p
 			ntres *= 2;
 			ptr->res = (TSLexeme *) repalloc(ptr->res, sizeof(TSLexeme) * ntres);
 		}
-
 	}
 
 	ptr->res[nres].lexeme = palloc(e - b + 1);
@@ -168,7 +167,7 @@ static void
 thesaurusRead(char *filename, DictThesaurus *d)
 {
 	tsearch_readline_state trst;
-	uint16		idsubst = 0;
+	uint32		idsubst = 0;
 	bool		useasis = false;
 	char	   *line;
 
@@ -184,8 +183,8 @@ thesaurusRead(char *filename, DictThesaurus *d)
 		char	   *ptr;
 		int			state = TR_WAITLEX;
 		char	   *beginwrd = NULL;
-		uint16		posinsubst = 0;
-		uint16		nwrd = 0;
+		uint32		posinsubst = 0;
+		uint32		nwrd = 0;
 
 		ptr = line;
 
@@ -286,6 +285,16 @@ thesaurusRead(char *filename, DictThesaurus *d)
 					(errcode(ERRCODE_CONFIG_FILE_ERROR),
 					 errmsg("unexpected end of line")));
 
+		/*
+		 * Note: currently, tsearch_readline can't return lines exceeding 4KB,
+		 * so overflow of the word counts is impossible.  But that may not
+		 * always be true, so let's check.
+		 */
+		if (nwrd != (uint16) nwrd || posinsubst != (uint16) posinsubst)
+			ereport(ERROR,
+					(errcode(ERRCODE_CONFIG_FILE_ERROR),
+					 errmsg("too many lexemes in thesaurus entry")));
+
 		pfree(line);
 	}
 
@@ -670,7 +679,7 @@ findTheLexeme(DictThesaurus *d, char *lexeme)
 }
 
 static bool
-matchIdSubst(LexemeInfo *stored, uint16 idsubst)
+matchIdSubst(LexemeInfo *stored, uint32 idsubst)
 {
 	bool		res = true;
author	Tom Lane <tgl@sss.pgh.pa.us>	2014-11-06 20:52:40 -0500
committer	Tom Lane <tgl@sss.pgh.pa.us>	2014-11-06 20:52:40 -0500
commit	d6e37b35cda9a88dfd938dd61e9986dd93cc6dd3 (patch)
tree	ddbc928427854eaedaf6fb8efed5f9c6144e701d
parent	4875931938b27924fe8d6f91bbdb09e2e5a29d0a (diff)
download	postgresql-d6e37b35cda9a88dfd938dd61e9986dd93cc6dd3.tar.gz postgresql-d6e37b35cda9a88dfd938dd61e9986dd93cc6dd3.zip