aboutsummaryrefslogtreecommitdiff
path: root/src/backend
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend')
-rw-r--r--src/backend/utils/adt/tsvector_op.c563
1 files changed, 563 insertions, 0 deletions
diff --git a/src/backend/utils/adt/tsvector_op.c b/src/backend/utils/adt/tsvector_op.c
index a3f1c36187d..6a01276ca26 100644
--- a/src/backend/utils/adt/tsvector_op.c
+++ b/src/backend/utils/adt/tsvector_op.c
@@ -14,6 +14,7 @@
#include "postgres.h"
+#include "access/htup_details.h"
#include "catalog/namespace.h"
#include "catalog/pg_type.h"
#include "commands/trigger.h"
@@ -65,6 +66,7 @@ typedef struct
#define STATHDRSIZE (offsetof(TSVectorStat, data))
static Datum tsvector_update_trigger(PG_FUNCTION_ARGS, bool config_column);
+static int tsvector_bsearch(TSVector tsin, char *lexin, int lexin_len);
/*
* Order: haspos, len, word, for all positions (pos, weight)
@@ -251,6 +253,90 @@ tsvector_setweight(PG_FUNCTION_ARGS)
PG_RETURN_POINTER(out);
}
+/*
+ * setweight(tsin tsvector, char_weight "char", lexemes "text"[])
+ *
+ * Assign weight w to elements of tsin that are listed in lexemes.
+ */
+Datum
+tsvector_setweight_by_filter(PG_FUNCTION_ARGS)
+{
+ TSVector tsin = PG_GETARG_TSVECTOR(0);
+ char char_weight = PG_GETARG_CHAR(1);
+ ArrayType *lexemes = PG_GETARG_ARRAYTYPE_P(2);
+
+ TSVector tsout;
+ int i,
+ j,
+ nlexemes,
+ weight;
+ WordEntry *entry;
+ Datum *dlexemes;
+ bool *nulls;
+
+ switch (char_weight)
+ {
+ case 'A': case 'a':
+ weight = 3;
+ break;
+ case 'B': case 'b':
+ weight = 2;
+ break;
+ case 'C': case 'c':
+ weight = 1;
+ break;
+ case 'D': case 'd':
+ weight = 0;
+ break;
+ default:
+ /* internal error */
+ elog(ERROR, "unrecognized weight: %c", char_weight);
+ }
+
+ tsout = (TSVector) palloc(VARSIZE(tsin));
+ memcpy(tsout, tsin, VARSIZE(tsin));
+ entry = ARRPTR(tsout);
+
+ deconstruct_array(lexemes, TEXTOID, -1, false, 'i',
+ &dlexemes, &nulls, &nlexemes);
+
+ /*
+ * Assuming that lexemes array is significantly shorter than tsvector
+ * we can iterate through lexemes performing binary search
+ * of each lexeme from lexemes in tsvector.
+ */
+ for (i = 0; i < nlexemes; i++)
+ {
+ char *lex;
+ int lex_len,
+ lex_pos;
+
+ if (nulls[i])
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("lexeme array may not contain nulls")));
+
+ lex = VARDATA(dlexemes[i]);
+ lex_len = VARSIZE_ANY_EXHDR(dlexemes[i]);
+ lex_pos = tsvector_bsearch(tsout, lex, lex_len);
+
+ if (lex_pos >= 0 && (j = POSDATALEN(tsout, entry + lex_pos)) != 0)
+ {
+ WordEntryPos *p = POSDATAPTR(tsout, entry + lex_pos);
+ while (j--)
+ {
+ WEP_SETWEIGHT(*p, weight);
+ p++;
+ }
+ }
+ }
+
+ PG_FREE_IF_COPY(tsin, 0);
+ PG_FREE_IF_COPY(lexemes, 2);
+
+ PG_RETURN_POINTER(tsout);
+}
+
#define compareEntry(pa, a, pb, b) \
tsCompareString((pa) + (a)->pos, (a)->len, \
(pb) + (b)->pos, (b)->len, \
@@ -291,6 +377,483 @@ add_pos(TSVector src, WordEntry *srcptr,
return *clen - startlen;
}
+/*
+ * Perform binary search of given lexeme in TSVector.
+ * Returns lexeme position in TSVector's entry array or -1 if lexeme wasn't
+ * found.
+ */
+static int
+tsvector_bsearch(const TSVector tsv, char *lexeme, int lexeme_len)
+{
+ WordEntry *arrin = ARRPTR(tsv);
+ int StopLow = 0,
+ StopHigh = tsv->size,
+ StopMiddle,
+ cmp;
+
+ while (StopLow < StopHigh)
+ {
+ StopMiddle = (StopLow + StopHigh)/2;
+
+ cmp = tsCompareString(lexeme, lexeme_len,
+ STRPTR(tsv) + arrin[StopMiddle].pos,
+ arrin[StopMiddle].len,
+ false);
+
+ if (cmp < 0)
+ StopHigh = StopMiddle;
+ else if (cmp > 0)
+ StopLow = StopMiddle + 1;
+ else /* found it */
+ return StopMiddle;
+ }
+
+ return -1;
+}
+
+static int
+compareint(const void *va, const void *vb)
+{
+ int32 a = *((const int32 *) va);
+ int32 b = *((const int32 *) vb);
+
+ if (a == b)
+ return 0;
+ return (a > b) ? 1 : -1;
+}
+
+/*
+ * Internal routine to delete lexemes from TSVector by array of offsets.
+ *
+ * int *indices_to_delete -- array of lexeme offsets to delete
+ * int indices_count -- size of that array
+ *
+ * Returns new TSVector without given lexemes along with their positions
+ * and weights.
+ */
+static TSVector
+tsvector_delete_by_indices(TSVector tsv, int *indices_to_delete,
+ int indices_count)
+{
+ TSVector tsout;
+ WordEntry *arrin = ARRPTR(tsv),
+ *arrout;
+ char *data = STRPTR(tsv),
+ *dataout;
+ int i, j, k,
+ curoff;
+
+ /*
+ * Here we overestimates tsout size, since we don't know exact size
+ * occupied by positions and weights. We will set exact size later
+ * after a pass through TSVector.
+ */
+ tsout = (TSVector) palloc0(VARSIZE(tsv));
+ arrout = ARRPTR(tsout);
+ tsout->size = tsv->size - indices_count;
+
+ /* Sort our filter array to simplify membership check later. */
+ if (indices_count > 1)
+ qsort(indices_to_delete, indices_count, sizeof(int), compareint);
+
+ /*
+ * Copy tsv to tsout skipping lexemes that enlisted in indices_to_delete.
+ */
+ curoff = 0;
+ dataout = STRPTR(tsout);
+ for (i = j = k = 0; i < tsv->size; i++)
+ {
+ /*
+ * Here we should check whether current i is present in
+ * indices_to_delete or not. Since indices_to_delete is already
+ * sorted we can advance it index only when we have match.
+ */
+ if (k < indices_count && i == indices_to_delete[k]){
+ k++;
+ continue;
+ }
+
+ /* Copy lexeme, it's positions and weights */
+ memcpy(dataout + curoff, data + arrin[i].pos, arrin[i].len);
+ arrout[j].haspos = arrin[i].haspos;
+ arrout[j].len = arrin[i].len;
+ arrout[j].pos = curoff;
+ curoff += arrin[i].len;
+ if (arrin[i].haspos)
+ {
+ int len = POSDATALEN(tsv, arrin+i) * sizeof(WordEntryPos) +
+ sizeof(uint16);
+ curoff = SHORTALIGN(curoff);
+ memcpy(dataout + curoff,
+ STRPTR(tsv) + SHORTALIGN(arrin[i].pos + arrin[i].len),
+ len);
+ curoff += len;
+ }
+
+ j++;
+ }
+
+ /*
+ * After the pass through TSVector k should equals exactly to indices_count.
+ * If it isn't then the caller provided us with indices outside of
+ * [0, tsv->size) range and estimation of tsout's size is wrong.
+ */
+ Assert(k == indices_count);
+
+ SET_VARSIZE(tsout, CALCDATASIZE(tsout->size, curoff));
+ return tsout;
+}
+
+/*
+ * Delete given lexeme from tsvector.
+ * Implementation of user-level delete(tsvector, text).
+ */
+Datum
+tsvector_delete_str(PG_FUNCTION_ARGS)
+{
+ TSVector tsin = PG_GETARG_TSVECTOR(0),
+ tsout;
+ text *tlexeme = PG_GETARG_TEXT_P(1);
+ char *lexeme = VARDATA(tlexeme);
+ int lexeme_len = VARSIZE_ANY_EXHDR(tlexeme),
+ skip_index;
+
+ if ((skip_index = tsvector_bsearch(tsin, lexeme, lexeme_len)) == -1)
+ PG_RETURN_POINTER(tsin);
+
+ tsout = tsvector_delete_by_indices(tsin, &skip_index, 1);
+
+ PG_FREE_IF_COPY(tsin, 0);
+ PG_FREE_IF_COPY(tlexeme, 1);
+ PG_RETURN_POINTER(tsout);
+}
+
+/*
+ * Delete given array of lexemes from tsvector.
+ * Implementation of user-level delete(tsvector, text[]).
+ */
+Datum
+tsvector_delete_arr(PG_FUNCTION_ARGS)
+{
+ TSVector tsin = PG_GETARG_TSVECTOR(0),
+ tsout;
+ ArrayType *lexemes = PG_GETARG_ARRAYTYPE_P(1);
+ int i, nlex,
+ skip_count,
+ *skip_indices;
+ Datum *dlexemes;
+ bool *nulls;
+
+ deconstruct_array(lexemes, TEXTOID, -1, false, 'i',
+ &dlexemes, &nulls, &nlex);
+
+ /*
+ * In typical use case array of lexemes to delete is relatively small.
+ * So here we optimizing things for that scenario: iterate through lexarr
+ * performing binary search of each lexeme from lexarr in tsvector.
+ */
+ skip_indices = palloc0(nlex * sizeof(int));
+ for (i = skip_count = 0; i < nlex; i++)
+ {
+ char *lex;
+ int lex_len,
+ lex_pos;
+
+ if (nulls[i])
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("lexeme array may not contain nulls")));
+
+ lex = VARDATA(dlexemes[i]);
+ lex_len = VARSIZE_ANY_EXHDR(dlexemes[i]);
+ lex_pos = tsvector_bsearch(tsin, lex, lex_len);
+
+ if (lex_pos >= 0)
+ skip_indices[skip_count++] = lex_pos;
+ }
+
+ tsout = tsvector_delete_by_indices(tsin, skip_indices, skip_count);
+
+ pfree(skip_indices);
+ PG_FREE_IF_COPY(tsin, 0);
+ PG_FREE_IF_COPY(lexemes, 1);
+
+ PG_RETURN_POINTER(tsout);
+}
+
+/*
+ * Expand tsvector as table with following columns:
+ * lexeme: lexeme text
+ * positions: integer array of lexeme positions
+ * weights: char array of weights corresponding to positions
+ */
+Datum
+tsvector_unnest(PG_FUNCTION_ARGS)
+{
+ FuncCallContext *funcctx;
+ TSVector tsin;
+
+ if (SRF_IS_FIRSTCALL())
+ {
+ MemoryContext oldcontext;
+ TupleDesc tupdesc;
+
+ funcctx = SRF_FIRSTCALL_INIT();
+ oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
+
+ tupdesc = CreateTemplateTupleDesc(3, false);
+ TupleDescInitEntry(tupdesc, (AttrNumber) 1, "lexeme",
+ TEXTOID, -1, 0);
+ TupleDescInitEntry(tupdesc, (AttrNumber) 2, "positions",
+ INT2ARRAYOID, -1, 0);
+ TupleDescInitEntry(tupdesc, (AttrNumber) 3, "weights",
+ TEXTARRAYOID, -1, 0);
+ funcctx->tuple_desc = BlessTupleDesc(tupdesc);
+
+ funcctx->user_fctx = PG_GETARG_TSVECTOR_COPY(0);
+
+ MemoryContextSwitchTo(oldcontext);
+ }
+
+ funcctx = SRF_PERCALL_SETUP();
+ tsin = (TSVector) funcctx->user_fctx;
+
+ if (funcctx->call_cntr < tsin->size)
+ {
+ WordEntry *arrin = ARRPTR(tsin);
+ char *data = STRPTR(tsin);
+ HeapTuple tuple;
+ int j,
+ i = funcctx->call_cntr;
+ bool nulls[] = {false, false, false};
+ Datum values[3];
+
+ values[0] = PointerGetDatum(
+ cstring_to_text_with_len(data + arrin[i].pos, arrin[i].len)
+ );
+
+ if (arrin[i].haspos)
+ {
+ WordEntryPosVector *posv;
+ Datum *positions;
+ Datum *weights;
+ char weight;
+
+ /*
+ * Internally tsvector stores position and weight in the same
+ * uint16 (2 bits for weight, 14 for position). Here we extract that
+ * in two separate arrays.
+ */
+ posv = _POSVECPTR(tsin, arrin + i);
+ positions = palloc(posv->npos * sizeof(Datum));
+ weights = palloc(posv->npos * sizeof(Datum));
+ for (j = 0; j < posv->npos; j++)
+ {
+ positions[j] = Int16GetDatum(WEP_GETPOS(posv->pos[j]));
+ weight = 'D' - WEP_GETWEIGHT(posv->pos[j]);
+ weights[j] = PointerGetDatum(
+ cstring_to_text_with_len(&weight, 1)
+ );
+ }
+
+ values[1] = PointerGetDatum(
+ construct_array(positions, posv->npos, INT2OID, 2, true, 's'));
+ values[2] = PointerGetDatum(
+ construct_array(weights, posv->npos, TEXTOID, -1, false, 'i'));
+ }
+ else
+ {
+ nulls[1] = nulls[2] = true;
+ }
+
+ tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls);
+ SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple));
+ }
+ else
+ {
+ pfree(tsin);
+ SRF_RETURN_DONE(funcctx);
+ }
+}
+
+/*
+ * Convert tsvector to array of lexemes.
+ */
+Datum
+tsvector_to_array(PG_FUNCTION_ARGS)
+{
+ TSVector tsin = PG_GETARG_TSVECTOR(0);
+ WordEntry *arrin = ARRPTR(tsin);
+ Datum elements[tsin->size];
+ int i;
+ ArrayType *array;
+
+ for (i = 0; i < tsin->size; i++)
+ {
+ elements[i] = PointerGetDatum(
+ cstring_to_text_with_len(STRPTR(tsin) + arrin[i].pos, arrin[i].len)
+ );
+ }
+
+ array = construct_array(elements, tsin->size, TEXTOID, -1, false, 'i');
+ PG_FREE_IF_COPY(tsin, 0);
+ PG_RETURN_POINTER(array);
+}
+
+/*
+ * Build tsvector from array of lexemes.
+ */
+Datum
+array_to_tsvector(PG_FUNCTION_ARGS)
+{
+ ArrayType *v = PG_GETARG_ARRAYTYPE_P(0);
+ TSVector tsout;
+ Datum *dlexemes;
+ WordEntry *arrout;
+ bool *nulls;
+ int nitems,
+ i,
+ tslen,
+ datalen = 0;
+ char *cur;
+
+ deconstruct_array(v, TEXTOID, -1, false, 'i', &dlexemes, &nulls, &nitems);
+
+ for (i = 0; i < nitems; i++)
+ {
+ if (nulls[i])
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("lexeme array may not contain nulls")));
+
+ datalen += VARSIZE_ANY_EXHDR(dlexemes[i]);
+ }
+
+ tslen = CALCDATASIZE(nitems, datalen);
+ tsout = (TSVector) palloc0(tslen);
+ SET_VARSIZE(tsout, tslen);
+ tsout->size = nitems;
+ arrout = ARRPTR(tsout);
+ cur = STRPTR(tsout);
+
+ for (i = 0; i < nitems; i++)
+ {
+ char *lex = VARDATA(dlexemes[i]);
+ int lex_len = VARSIZE_ANY_EXHDR(dlexemes[i]);
+
+ memcpy(cur, lex, lex_len);
+ arrout[i].haspos = 0;
+ arrout[i].len = lex_len;
+ arrout[i].pos = cur - STRPTR(tsout);
+ cur += lex_len;
+ }
+
+ PG_FREE_IF_COPY(v, 0);
+ PG_RETURN_POINTER(tsout);
+}
+
+/*
+ * Leave only elements with given weights from tsvector.
+ */
+Datum
+tsvector_filter(PG_FUNCTION_ARGS)
+{
+ TSVector tsin = PG_GETARG_TSVECTOR(0),
+ tsout;
+ ArrayType *weights = PG_GETARG_ARRAYTYPE_P(1);
+ WordEntry *arrin = ARRPTR(tsin),
+ *arrout;
+ char *datain = STRPTR(tsin),
+ *dataout;
+ Datum *dweights;
+ bool *nulls;
+ int nweigths;
+ int i, j;
+ char mask = 0,
+ cur_pos = 0;
+
+ deconstruct_array(weights, CHAROID, 1, true, 'c',
+ &dweights, &nulls, &nweigths);
+
+ for (i = 0; i < nweigths; i++)
+ {
+ char char_weight;
+
+ if (nulls[i])
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("weight array may not contain nulls")));
+
+ char_weight = DatumGetChar(dweights[i]);
+ switch (char_weight)
+ {
+ case 'A': case 'a':
+ mask = mask | 8;
+ break;
+ case 'B': case 'b':
+ mask = mask | 4;
+ break;
+ case 'C': case 'c':
+ mask = mask | 2;
+ break;
+ case 'D': case 'd':
+ mask = mask | 1;
+ break;
+ default:
+ /* internal error */
+ elog(ERROR, "unrecognized weight: %c", char_weight);
+ }
+ }
+
+ tsout = (TSVector) palloc0(VARSIZE(tsin));
+ tsout->size = tsin->size;
+ arrout = ARRPTR(tsout);
+ dataout = STRPTR(tsout);
+
+ for (i = j = 0; i < tsin->size; i++)
+ {
+ WordEntryPosVector *posvin,
+ *posvout;
+ int npos = 0;
+ int k;
+
+ if (!arrin[i].haspos)
+ continue;
+
+ posvin = _POSVECPTR(tsin, arrin + i);
+ posvout = (WordEntryPosVector *)
+ (dataout + SHORTALIGN(cur_pos + arrin[i].len));
+
+ for (k = 0; k < posvin->npos; k++)
+ {
+ if (mask & (1 << WEP_GETWEIGHT(posvin->pos[k])))
+ posvout->pos[npos++] = posvin->pos[k];
+ }
+
+ if (!npos) /* no satisfactory positions found, so skip that lexeme */
+ continue;
+
+ arrout[j].haspos = true;
+ arrout[j].len = arrin[i].len;
+ arrout[j].pos = cur_pos;
+
+ memcpy(dataout + cur_pos, datain + arrin[i].pos, arrin[i].len);
+ posvout->npos = npos;
+ cur_pos += SHORTALIGN(arrin[i].len);
+ cur_pos += POSDATALEN(tsout, arrout+j) * sizeof(WordEntryPos) +
+ sizeof(uint16);
+ j++;
+ }
+
+ tsout->size = j;
+ if (dataout != STRPTR(tsout))
+ memmove(STRPTR(tsout), dataout, cur_pos);
+
+ SET_VARSIZE(tsout, CALCDATASIZE(tsout->size, cur_pos));
+
+ PG_FREE_IF_COPY(tsin, 0);
+ PG_RETURN_POINTER(tsout);
+}
Datum
tsvector_concat(PG_FUNCTION_ARGS)