Refactoring by Heikki Linnakangas <heikki@enterprisedb.com> with

small editorization by me - Brake the QueryItem struct into QueryOperator and QueryOperand. Type was really the only common field between them. QueryItem still exists, and is used in the TSQuery struct as before, but it's now a union of the two. Many other changes fell from that, like separation of pushval_asis function into pushValue, pushOperator and pushStop. - Moved some structs that were for internal use only from header files to the right .c-files. - Moved tsvector parser to a new tsvector_parser.c file. Parser code was about half of the size of tsvector.c, it's also used from tsquery.c, and it has some data structures of its own, so it seems better to separate it. Cleaned up the API so that TSVectorParserState is not accessed from outside tsvector_parser.c. - Separated enumerations (#defines, really) used for QueryItem.type field and as return codes from gettoken_query. It was just accidental code sharing. - Removed ParseQueryNode struct used internally by makepol and friends. push*-functions now construct QueryItems directly. - Changed int4 variables to just ints for variables like "i" or "array size", where the storage-size was not significant.
author: Teodor Sigaev <teodor@sigaev.ru> 2007-09-07 15:09:56 +0000
committer: Teodor Sigaev <teodor@sigaev.ru> 2007-09-07 15:09:56 +0000
commit: e5be89981fc70648eedb325781cf2fbd4da05ba8 (patch)
tree: 8a2c587b7ae857fcbe2c2ed4ff21772c11d935a3 /src/backend/utils/adt/tsvector.c
parent: da1248401d00bce0cb75fa3ab278f906f407bec9 (diff)
download: postgresql-e5be89981fc70648eedb325781cf2fbd4da05ba8.tar.gz
postgresql-e5be89981fc70648eedb325781cf2fbd4da05ba8.zip
1 files changed, 108 insertions, 306 deletions
diff --git a/src/backend/utils/adt/tsvector.c b/src/backend/utils/adt/tsvector.c
index 8ab024650f7..2866e028da0 100644
--- a/src/backend/utils/adt/tsvector.c
+++ b/src/backend/utils/adt/tsvector.c
@@ -7,7 +7,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/adt/tsvector.c,v 1.2 2007/08/21 01:45:33 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/adt/tsvector.c,v 1.3 2007/09/07 15:09:56 teodor Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -20,22 +20,37 @@
 #include "tsearch/ts_utils.h"
 #include "utils/memutils.h"
 
+typedef struct
+{
+	WordEntry	entry;			/* should be first ! */
+	WordEntryPos *pos;
+	int			poslen;			/* number of elements in pos */
+} WordEntryIN;
 
 static int
 comparePos(const void *a, const void *b)
 {
-	if (WEP_GETPOS(*(WordEntryPos *) a) == WEP_GETPOS(*(WordEntryPos *) b))
+	int apos = WEP_GETPOS(*(WordEntryPos *) a);
+	int bpos = WEP_GETPOS(*(WordEntryPos *) b);
+
+	if (apos == bpos)
 		return 0;
-	return (WEP_GETPOS(*(WordEntryPos *) a) > WEP_GETPOS(*(WordEntryPos *) b)) ? 1 : -1;
+	return (apos > bpos) ? 1 : -1;
 }
 
+/*
+ * Removes duplicate pos entries. If there's two entries with same pos
+ * but different weight, the higher weight is retained.
+ *
+ * Returns new length.
+ */
 static int
-uniquePos(WordEntryPos * a, int4 l)
+uniquePos(WordEntryPos * a, int l)
 {
 	WordEntryPos *ptr,
 			   *res;
 
-	if (l == 1)
+	if (l <= 1)
 		return l;
 
 	res = a;
@@ -75,21 +90,23 @@ compareentry(const void *a, const void *b, void *arg)
 }
 
 static int
-uniqueentry(WordEntryIN * a, int4 l, char *buf, int4 *outbuflen)
+uniqueentry(WordEntryIN * a, int l, char *buf, int *outbuflen)
 {
 	WordEntryIN *ptr,
 			   *res;
 
-	res = a;
+	Assert(l >= 1);
+
 	if (l == 1)
 	{
 		if (a->entry.haspos)
 		{
-			*(uint16 *) (a->pos) = uniquePos(&(a->pos[1]), *(uint16 *) (a->pos));
-			*outbuflen = SHORTALIGN(res->entry.len) + (*(uint16 *) (a->pos) + 1) * sizeof(WordEntryPos);
+			a->poslen = uniquePos(a->pos, a->poslen);
+			*outbuflen = SHORTALIGN(a->entry.len) + (a->poslen + 1) * sizeof(WordEntryPos);
 		}
 		return l;
 	}
+	res = a;
 
 	ptr = a + 1;
 	qsort_arg((void *) a, l, sizeof(WordEntryIN), compareentry, (void *) buf);
@@ -101,8 +118,8 @@ uniqueentry(WordEntryIN * a, int4 l, char *buf, int4 *outbuflen)
 		{
 			if (res->entry.haspos)
 			{
-				*(uint16 *) (res->pos) = uniquePos(&(res->pos[1]), *(uint16 *) (res->pos));
-				*outbuflen += *(uint16 *) (res->pos) * sizeof(WordEntryPos);
+				res->poslen = uniquePos(res->pos, res->poslen);
+				*outbuflen += res->poslen * sizeof(WordEntryPos);
 			}
 			*outbuflen += SHORTALIGN(res->entry.len);
 			res++;
@@ -112,12 +129,14 @@ uniqueentry(WordEntryIN * a, int4 l, char *buf, int4 *outbuflen)
 		{
 			if (res->entry.haspos)
 			{
-				int4		len = *(uint16 *) (ptr->pos) + 1 + *(uint16 *) (res->pos);
+				int	newlen = ptr->poslen + res->poslen;
+
+				/* Append res to pos */
 
-				res->pos = (WordEntryPos *) repalloc(res->pos, len * sizeof(WordEntryPos));
-				memcpy(&(res->pos[*(uint16 *) (res->pos) + 1]),
-					   &(ptr->pos[1]), *(uint16 *) (ptr->pos) * sizeof(WordEntryPos));
-				*(uint16 *) (res->pos) += *(uint16 *) (ptr->pos);
+				res->pos = (WordEntryPos *) repalloc(res->pos, newlen * sizeof(WordEntryPos));
+				memcpy(&res->pos[res->poslen],
+					   ptr->pos, ptr->poslen * sizeof(WordEntryPos));
+				res->poslen = newlen;
 				pfree(ptr->pos);
 			}
 			else
@@ -130,8 +149,8 @@ uniqueentry(WordEntryIN * a, int4 l, char *buf, int4 *outbuflen)
 	}
 	if (res->entry.haspos)
 	{
-		*(uint16 *) (res->pos) = uniquePos(&(res->pos[1]), *(uint16 *) (res->pos));
-		*outbuflen += *(uint16 *) (res->pos) * sizeof(WordEntryPos);
+		res->poslen = uniquePos(res->pos, res->poslen);
+		*outbuflen += res->poslen * sizeof(WordEntryPos);
 	}
 	*outbuflen += SHORTALIGN(res->entry.len);
 
@@ -144,248 +163,6 @@ WordEntryCMP(WordEntry * a, WordEntry * b, char *buf)
 	return compareentry(a, b, buf);
 }
 
-#define WAITWORD		1
-#define WAITENDWORD		2
-#define WAITNEXTCHAR	3
-#define WAITENDCMPLX	4
-#define WAITPOSINFO		5
-#define INPOSINFO		6
-#define WAITPOSDELIM	7
-#define WAITCHARCMPLX	8
-
-#define RESIZEPRSBUF \
-do { \
-	if ( state->curpos - state->word + pg_database_encoding_max_length() >= state->len ) \
-	{ \
-		int4 clen = state->curpos - state->word; \
-		state->len *= 2; \
-		state->word = (char*)repalloc( (void*)state->word, state->len ); \
-		state->curpos = state->word + clen; \
-	} \
-} while (0)
-
-bool
-gettoken_tsvector(TSVectorParseState *state)
-{
-	int4		oldstate = 0;
-
-	state->curpos = state->word;
-	state->state = WAITWORD;
-	state->alen = 0;
-
-	while (1)
-	{
-		if (state->state == WAITWORD)
-		{
-			if (*(state->prsbuf) == '\0')
-				return false;
-			else if (t_iseq(state->prsbuf, '\''))
-				state->state = WAITENDCMPLX;
-			else if (t_iseq(state->prsbuf, '\\'))
-			{
-				state->state = WAITNEXTCHAR;
-				oldstate = WAITENDWORD;
-			}
-			else if (state->oprisdelim && ISOPERATOR(state->prsbuf))
-				ereport(ERROR,
-						(errcode(ERRCODE_SYNTAX_ERROR),
-						 errmsg("syntax error in tsvector")));
-			else if (!t_isspace(state->prsbuf))
-			{
-				COPYCHAR(state->curpos, state->prsbuf);
-				state->curpos += pg_mblen(state->prsbuf);
-				state->state = WAITENDWORD;
-			}
-		}
-		else if (state->state == WAITNEXTCHAR)
-		{
-			if (*(state->prsbuf) == '\0')
-				ereport(ERROR,
-						(errcode(ERRCODE_SYNTAX_ERROR),
-						 errmsg("there is no escaped character")));
-			else
-			{
-				RESIZEPRSBUF;
-				COPYCHAR(state->curpos, state->prsbuf);
-				state->curpos += pg_mblen(state->prsbuf);
-				state->state = oldstate;
-			}
-		}
-		else if (state->state == WAITENDWORD)
-		{
-			if (t_iseq(state->prsbuf, '\\'))
-			{
-				state->state = WAITNEXTCHAR;
-				oldstate = WAITENDWORD;
-			}
-			else if (t_isspace(state->prsbuf) || *(state->prsbuf) == '\0' ||
-					 (state->oprisdelim && ISOPERATOR(state->prsbuf)))
-			{
-				RESIZEPRSBUF;
-				if (state->curpos == state->word)
-					ereport(ERROR,
-							(errcode(ERRCODE_SYNTAX_ERROR),
-							 errmsg("syntax error in tsvector")));
-				*(state->curpos) = '\0';
-				return true;
-			}
-			else if (t_iseq(state->prsbuf, ':'))
-			{
-				if (state->curpos == state->word)
-					ereport(ERROR,
-							(errcode(ERRCODE_SYNTAX_ERROR),
-							 errmsg("syntax error in tsvector")));
-				*(state->curpos) = '\0';
-				if (state->oprisdelim)
-					return true;
-				else
-					state->state = INPOSINFO;
-			}
-			else
-			{
-				RESIZEPRSBUF;
-				COPYCHAR(state->curpos, state->prsbuf);
-				state->curpos += pg_mblen(state->prsbuf);
-			}
-		}
-		else if (state->state == WAITENDCMPLX)
-		{
-			if (t_iseq(state->prsbuf, '\''))
-			{
-				state->state = WAITCHARCMPLX;
-			}
-			else if (t_iseq(state->prsbuf, '\\'))
-			{
-				state->state = WAITNEXTCHAR;
-				oldstate = WAITENDCMPLX;
-			}
-			else if (*(state->prsbuf) == '\0')
-				ereport(ERROR,
-						(errcode(ERRCODE_SYNTAX_ERROR),
-						 errmsg("syntax error in tsvector")));
-			else
-			{
-				RESIZEPRSBUF;
-				COPYCHAR(state->curpos, state->prsbuf);
-				state->curpos += pg_mblen(state->prsbuf);
-			}
-		}
-		else if (state->state == WAITCHARCMPLX)
-		{
-			if (t_iseq(state->prsbuf, '\''))
-			{
-				RESIZEPRSBUF;
-				COPYCHAR(state->curpos, state->prsbuf);
-				state->curpos += pg_mblen(state->prsbuf);
-				state->state = WAITENDCMPLX;
-			}
-			else
-			{
-				RESIZEPRSBUF;
-				*(state->curpos) = '\0';
-				if (state->curpos == state->word)
-					ereport(ERROR,
-							(errcode(ERRCODE_SYNTAX_ERROR),
-							 errmsg("syntax error in tsvector")));
-				if (state->oprisdelim)
-				{
-					/* state->prsbuf+=pg_mblen(state->prsbuf); */
-					return true;
-				}
-				else
-					state->state = WAITPOSINFO;
-				continue;		/* recheck current character */
-			}
-		}
-		else if (state->state == WAITPOSINFO)
-		{
-			if (t_iseq(state->prsbuf, ':'))
-				state->state = INPOSINFO;
-			else
-				return true;
-		}
-		else if (state->state == INPOSINFO)
-		{
-			if (t_isdigit(state->prsbuf))
-			{
-				if (state->alen == 0)
-				{
-					state->alen = 4;
-					state->pos = (WordEntryPos *) palloc(sizeof(WordEntryPos) * state->alen);
-					*(uint16 *) (state->pos) = 0;
-				}
-				else if (*(uint16 *) (state->pos) + 1 >= state->alen)
-				{
-					state->alen *= 2;
-					state->pos = (WordEntryPos *) repalloc(state->pos, sizeof(WordEntryPos) * state->alen);
-				}
-				(*(uint16 *) (state->pos))++;
-				WEP_SETPOS(state->pos[*(uint16 *) (state->pos)], LIMITPOS(atoi(state->prsbuf)));
-				if (WEP_GETPOS(state->pos[*(uint16 *) (state->pos)]) == 0)
-					ereport(ERROR,
-							(errcode(ERRCODE_SYNTAX_ERROR),
-							 errmsg("wrong position info in tsvector")));
-				WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 0);
-				state->state = WAITPOSDELIM;
-			}
-			else
-				ereport(ERROR,
-						(errcode(ERRCODE_SYNTAX_ERROR),
-						 errmsg("syntax error in tsvector")));
-		}
-		else if (state->state == WAITPOSDELIM)
-		{
-			if (t_iseq(state->prsbuf, ','))
-				state->state = INPOSINFO;
-			else if (t_iseq(state->prsbuf, 'a') || t_iseq(state->prsbuf, 'A') || t_iseq(state->prsbuf, '*'))
-			{
-				if (WEP_GETWEIGHT(state->pos[*(uint16 *) (state->pos)]))
-					ereport(ERROR,
-							(errcode(ERRCODE_SYNTAX_ERROR),
-							 errmsg("syntax error in tsvector")));
-				WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 3);
-			}
-			else if (t_iseq(state->prsbuf, 'b') || t_iseq(state->prsbuf, 'B'))
-			{
-				if (WEP_GETWEIGHT(state->pos[*(uint16 *) (state->pos)]))
-					ereport(ERROR,
-							(errcode(ERRCODE_SYNTAX_ERROR),
-							 errmsg("syntax error in tsvector")));
-				WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 2);
-			}
-			else if (t_iseq(state->prsbuf, 'c') || t_iseq(state->prsbuf, 'C'))
-			{
-				if (WEP_GETWEIGHT(state->pos[*(uint16 *) (state->pos)]))
-					ereport(ERROR,
-							(errcode(ERRCODE_SYNTAX_ERROR),
-							 errmsg("syntax error in tsvector")));
-				WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 1);
-			}
-			else if (t_iseq(state->prsbuf, 'd') || t_iseq(state->prsbuf, 'D'))
-			{
-				if (WEP_GETWEIGHT(state->pos[*(uint16 *) (state->pos)]))
-					ereport(ERROR,
-							(errcode(ERRCODE_SYNTAX_ERROR),
-							 errmsg("syntax error in tsvector")));
-				WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 0);
-			}
-			else if (t_isspace(state->prsbuf) ||
-					 *(state->prsbuf) == '\0')
-				return true;
-			else if (!t_isdigit(state->prsbuf))
-				ereport(ERROR,
-						(errcode(ERRCODE_SYNTAX_ERROR),
-						 errmsg("syntax error in tsvector")));
-		}
-		else					/* internal error */
-			elog(ERROR, "internal error in gettoken_tsvector");
-
-		/* get next char */
-		state->prsbuf += pg_mblen(state->prsbuf);
-	}
-
-	return false;
-}
 
 Datum
 tsvectorin(PG_FUNCTION_ARGS)
@@ -393,70 +170,82 @@ tsvectorin(PG_FUNCTION_ARGS)
 	char	   *buf = PG_GETARG_CSTRING(0);
 	TSVectorParseState state;
 	WordEntryIN *arr;
+	int			totallen;
+	int			arrlen;  /* allocated size of arr */
 	WordEntry  *inarr;
-	int4		len = 0,
-				totallen = 64;
+	int			len = 0;
 	TSVector	in;
-	char	   *tmpbuf,
-			   *cur;
-	int4		i,
-				buflen = 256;
+	int			i;
+	char	   *token;
+	int			toklen;
+	WordEntryPos *pos;
+	int			poslen;
+
+	/*
+	 * Tokens are appended to tmpbuf, cur is a pointer
+	 * to the end of used space in tmpbuf.
+	 */
+	char	   *tmpbuf;
+	char	   *cur;
+	int			buflen = 256; /* allocated size of tmpbuf */
 
 	pg_verifymbstr(buf, strlen(buf), false);
-	state.prsbuf = buf;
-	state.len = 32;
-	state.word = (char *) palloc(state.len);
-	state.oprisdelim = false;
 
-	arr = (WordEntryIN *) palloc(sizeof(WordEntryIN) * totallen);
+	state = init_tsvector_parser(buf, false);
+	
+	arrlen = 64;
+	arr = (WordEntryIN *) palloc(sizeof(WordEntryIN) * arrlen);
 	cur = tmpbuf = (char *) palloc(buflen);
 
-	while (gettoken_tsvector(&state))
+	while (gettoken_tsvector(state, &token, &toklen, &pos, &poslen, NULL))
 	{
-		/*
-		 * Realloc buffers if it's needed
-		 */
-		if (len >= totallen)
-		{
-			totallen *= 2;
-			arr = (WordEntryIN *) repalloc((void *) arr, sizeof(WordEntryIN) * totallen);
-		}
-
-		while ((cur - tmpbuf) + (state.curpos - state.word) >= buflen)
-		{
-			int4		dist = cur - tmpbuf;
-
-			buflen *= 2;
-			tmpbuf = (char *) repalloc((void *) tmpbuf, buflen);
-			cur = tmpbuf + dist;
-		}
 
-		if (state.curpos - state.word >= MAXSTRLEN)
+		if (toklen >= MAXSTRLEN)
 			ereport(ERROR,
 					(errcode(ERRCODE_SYNTAX_ERROR),
 					 errmsg("word is too long (%ld bytes, max %ld bytes)",
-							(long) (state.curpos - state.word),
+							(long) toklen,
 							(long) MAXSTRLEN)));
 
-		arr[len].entry.len = state.curpos - state.word;
+
 		if (cur - tmpbuf > MAXSTRPOS)
 			ereport(ERROR,
 					(errcode(ERRCODE_SYNTAX_ERROR),
 					 errmsg("position value too large")));
+
+		/*
+		 * Enlarge buffers if needed
+		 */
+		if (len >= arrlen)
+		{
+			arrlen *= 2;
+			arr = (WordEntryIN *) repalloc((void *) arr, sizeof(WordEntryIN) * arrlen);
+		}
+		while ((cur - tmpbuf) + toklen >= buflen)
+		{
+			int	dist = cur - tmpbuf;
+
+			buflen *= 2;
+			tmpbuf = (char *) repalloc((void *) tmpbuf, buflen);
+			cur = tmpbuf + dist;
+		}
+		arr[len].entry.len = toklen;
 		arr[len].entry.pos = cur - tmpbuf;
-		memcpy((void *) cur, (void *) state.word, arr[len].entry.len);
-		cur += arr[len].entry.len;
+		memcpy((void *) cur, (void *) token, toklen);
+		cur += toklen;
 
-		if (state.alen)
+		if (poslen != 0)
 		{
 			arr[len].entry.haspos = 1;
-			arr[len].pos = state.pos;
+			arr[len].pos = pos;
+			arr[len].poslen = poslen;
 		}
 		else
 			arr[len].entry.haspos = 0;
 		len++;
 	}
-	pfree(state.word);
+
+	close_tsvector_parser(state);
 
 	if (len > 0)
 		len = uniqueentry(arr, len, tmpbuf, &buflen);
@@ -476,8 +265,21 @@ tsvectorin(PG_FUNCTION_ARGS)
 		cur += SHORTALIGN(arr[i].entry.len);
 		if (arr[i].entry.haspos)
 		{
-			memcpy(cur, arr[i].pos, (*(uint16 *) arr[i].pos + 1) * sizeof(WordEntryPos));
-			cur += (*(uint16 *) arr[i].pos + 1) * sizeof(WordEntryPos);
+			uint16 tmplen;
+
+			if(arr[i].poslen > 0xFFFF)
+				elog(ERROR, "positions array too long");
+
+			tmplen = (uint16) arr[i].poslen;
+
+			/* Copy length to output struct */
+			memcpy(cur, &tmplen, sizeof(uint16));
+			cur += sizeof(uint16);
+
+			/* Copy positions */
+			memcpy(cur, arr[i].pos, (arr[i].poslen) * sizeof(WordEntryPos));
+			cur += arr[i].poslen * sizeof(WordEntryPos);
+
 			pfree(arr[i].pos);
 		}
 		inarr[i] = arr[i].entry;
@@ -604,26 +406,26 @@ tsvectorrecv(PG_FUNCTION_ARGS)
 {
 	StringInfo	buf = (StringInfo) PG_GETARG_POINTER(0);
 	TSVector	vec;
-	int			i,
-				size,
-				len = DATAHDRSIZE;
+	int			i;
+	uint32		size;
 	WordEntry  *weptr;
 	int			datalen = 0;
+	Size		len;
 
 	size = pq_getmsgint(buf, sizeof(uint32));
 	if (size < 0 || size > (MaxAllocSize / sizeof(WordEntry)))
 		elog(ERROR, "invalid size of tsvector");
 
-	len += sizeof(WordEntry) * size;
+	len = DATAHDRSIZE + sizeof(WordEntry) * size;
 
-	len *= 2;
+	len = len * 2; /* times two to make room for lexemes */
 	vec = (TSVector) palloc0(len);
 	vec->size = size;
 
 	weptr = ARRPTR(vec);
 	for (i = 0; i < size; i++)
 	{
-		int			tmp;
+		int32 tmp;
 
 		weptr = ARRPTR(vec) + i;
 
@@ -654,7 +456,7 @@ tsvectorrecv(PG_FUNCTION_ARGS)
 						npos;
 			WordEntryPos *wepptr;
 
-			npos = (uint16) pq_getmsgint(buf, sizeof(int16));
+			npos = (uint16) pq_getmsgint(buf, sizeof(uint16));
 			if (npos > MAXNUMPOS)
 				elog(ERROR, "unexpected number of positions");
author	Teodor Sigaev <teodor@sigaev.ru>	2007-09-07 15:09:56 +0000
committer	Teodor Sigaev <teodor@sigaev.ru>	2007-09-07 15:09:56 +0000
commit	e5be89981fc70648eedb325781cf2fbd4da05ba8 (patch)
tree	8a2c587b7ae857fcbe2c2ed4ff21772c11d935a3 /src/backend/utils/adt/tsvector.c
parent	da1248401d00bce0cb75fa3ab278f906f407bec9 (diff)
download	postgresql-e5be89981fc70648eedb325781cf2fbd4da05ba8.tar.gz postgresql-e5be89981fc70648eedb325781cf2fbd4da05ba8.zip