aboutsummaryrefslogtreecommitdiff
path: root/src/backend/utils/adt/tsvector_op.c
diff options
context:
space:
mode:
authorTom Lane <tgl@sss.pgh.pa.us>2016-08-05 15:14:08 -0400
committerTom Lane <tgl@sss.pgh.pa.us>2016-08-05 15:14:19 -0400
commitc50d192ce33c10fa06411306f8644b4f47ce9a06 (patch)
treeeaa0ac8af76c72925d3678170d6df8585f8ad7f0 /src/backend/utils/adt/tsvector_op.c
parent33fe7360afdc0bb1820743ea5bfe3fc7d522f6c4 (diff)
downloadpostgresql-c50d192ce33c10fa06411306f8644b4f47ce9a06.tar.gz
postgresql-c50d192ce33c10fa06411306f8644b4f47ce9a06.zip
Fix ts_delete(tsvector, text[]) to cope with duplicate array entries.
Such cases either failed an Assert, or produced a corrupt tsvector in non-Assert builds, as reported by Andreas Seltenreich. The reason is that tsvector_delete_by_indices() just assumed that its input array had no duplicates. Fix by explicitly de-duping. In passing, improve some comments, and fix a number of tests for null values to use ERRCODE_NULL_VALUE_NOT_ALLOWED not ERRCODE_INVALID_PARAMETER_VALUE. Discussion: <87invhoj6e.fsf@credativ.de>
Diffstat (limited to 'src/backend/utils/adt/tsvector_op.c')
-rw-r--r--src/backend/utils/adt/tsvector_op.c77
1 files changed, 46 insertions, 31 deletions
diff --git a/src/backend/utils/adt/tsvector_op.c b/src/backend/utils/adt/tsvector_op.c
index 63ae4330ef1..29cc687643c 100644
--- a/src/backend/utils/adt/tsvector_op.c
+++ b/src/backend/utils/adt/tsvector_op.c
@@ -317,7 +317,7 @@ tsvector_setweight_by_filter(PG_FUNCTION_ARGS)
if (nulls[i])
ereport(ERROR,
- (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
errmsg("lexeme array may not contain nulls")));
lex = VARDATA(dlexemes[i]);
@@ -430,7 +430,7 @@ compareint(const void *va, const void *vb)
/*
* Internal routine to delete lexemes from TSVector by array of offsets.
*
- * int *indices_to_delete -- array of lexeme offsets to delete
+ * int *indices_to_delete -- array of lexeme offsets to delete (modified here!)
* int indices_count -- size of that array
*
* Returns new TSVector without given lexemes along with their positions
@@ -445,35 +445,51 @@ tsvector_delete_by_indices(TSVector tsv, int *indices_to_delete,
*arrout;
char *data = STRPTR(tsv),
*dataout;
- int i,
- j,
- k,
- curoff;
+ int i, /* index in arrin */
+ j, /* index in arrout */
+ k, /* index in indices_to_delete */
+ curoff; /* index in dataout area */
/*
- * Here we overestimates tsout size, since we don't know exact size
- * occupied by positions and weights. We will set exact size later after a
- * pass through TSVector.
+ * Sort the filter array to simplify membership checks below. Also, get
+ * rid of any duplicate entries, so that we can assume that indices_count
+ * is exactly equal to the number of lexemes that will be removed.
*/
- tsout = (TSVector) palloc0(VARSIZE(tsv));
- arrout = ARRPTR(tsout);
- tsout->size = tsv->size - indices_count;
-
- /* Sort our filter array to simplify membership check later. */
if (indices_count > 1)
+ {
+ int kp;
+
qsort(indices_to_delete, indices_count, sizeof(int), compareint);
+ kp = 0;
+ for (k = 1; k < indices_count; k++)
+ {
+ if (indices_to_delete[k] != indices_to_delete[kp])
+ indices_to_delete[++kp] = indices_to_delete[k];
+ }
+ indices_count = ++kp;
+ }
/*
- * Copy tsv to tsout skipping lexemes that enlisted in indices_to_delete.
+ * Here we overestimate tsout size, since we don't know how much space is
+ * used by the deleted lexeme(s). We will set exact size below.
*/
- curoff = 0;
+ tsout = (TSVector) palloc0(VARSIZE(tsv));
+
+ /* This count must be correct because STRPTR(tsout) relies on it. */
+ tsout->size = tsv->size - indices_count;
+
+ /*
+ * Copy tsv to tsout, skipping lexemes listed in indices_to_delete.
+ */
+ arrout = ARRPTR(tsout);
dataout = STRPTR(tsout);
+ curoff = 0;
for (i = j = k = 0; i < tsv->size; i++)
{
/*
- * Here we should check whether current i is present in
- * indices_to_delete or not. Since indices_to_delete is already sorted
- * we can advance it index only when we have match.
+ * If current i is present in indices_to_delete, skip this lexeme.
+ * Since indices_to_delete is already sorted, we only need to check
+ * the current (k'th) entry.
*/
if (k < indices_count && i == indices_to_delete[k])
{
@@ -481,7 +497,7 @@ tsvector_delete_by_indices(TSVector tsv, int *indices_to_delete,
continue;
}
- /* Copy lexeme, it's positions and weights */
+ /* Copy lexeme and its positions and weights */
memcpy(dataout + curoff, data + arrin[i].pos, arrin[i].len);
arrout[j].haspos = arrin[i].haspos;
arrout[j].len = arrin[i].len;
@@ -489,8 +505,8 @@ tsvector_delete_by_indices(TSVector tsv, int *indices_to_delete,
curoff += arrin[i].len;
if (arrin[i].haspos)
{
- int len = POSDATALEN(tsv, arrin + i) * sizeof(WordEntryPos) +
- sizeof(uint16);
+ int len = POSDATALEN(tsv, arrin + i) * sizeof(WordEntryPos)
+ + sizeof(uint16);
curoff = SHORTALIGN(curoff);
memcpy(dataout + curoff,
@@ -503,10 +519,9 @@ tsvector_delete_by_indices(TSVector tsv, int *indices_to_delete,
}
/*
- * After the pass through TSVector k should equals exactly to
- * indices_count. If it isn't then the caller provided us with indices
- * outside of [0, tsv->size) range and estimation of tsout's size is
- * wrong.
+ * k should now be exactly equal to indices_count. If it isn't then the
+ * caller provided us with indices outside of [0, tsv->size) range and
+ * estimation of tsout's size is wrong.
*/
Assert(k == indices_count);
@@ -560,7 +575,7 @@ tsvector_delete_arr(PG_FUNCTION_ARGS)
/*
* In typical use case array of lexemes to delete is relatively small. So
- * here we optimizing things for that scenario: iterate through lexarr
+ * here we optimize things for that scenario: iterate through lexarr
* performing binary search of each lexeme from lexarr in tsvector.
*/
skip_indices = palloc0(nlex * sizeof(int));
@@ -572,10 +587,10 @@ tsvector_delete_arr(PG_FUNCTION_ARGS)
if (nulls[i])
ereport(ERROR,
- (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
errmsg("lexeme array may not contain nulls")));
- lex = VARDATA(dlexemes[i]);
+ lex = VARDATA_ANY(dlexemes[i]);
lex_len = VARSIZE_ANY_EXHDR(dlexemes[i]);
lex_pos = tsvector_bsearch(tsin, lex, lex_len);
@@ -738,7 +753,7 @@ array_to_tsvector(PG_FUNCTION_ARGS)
{
if (nulls[i])
ereport(ERROR,
- (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
errmsg("lexeme array may not contain nulls")));
datalen += VARSIZE_ANY_EXHDR(dlexemes[i]);
@@ -797,7 +812,7 @@ tsvector_filter(PG_FUNCTION_ARGS)
if (nulls[i])
ereport(ERROR,
- (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
errmsg("weight array may not contain nulls")));
char_weight = DatumGetChar(dweights[i]);