diff options
Diffstat (limited to 'src/backend')
-rw-r--r-- | src/backend/utils/adt/tsvector_op.c | 563 |
1 files changed, 563 insertions, 0 deletions
diff --git a/src/backend/utils/adt/tsvector_op.c b/src/backend/utils/adt/tsvector_op.c index a3f1c36187d..6a01276ca26 100644 --- a/src/backend/utils/adt/tsvector_op.c +++ b/src/backend/utils/adt/tsvector_op.c @@ -14,6 +14,7 @@ #include "postgres.h" +#include "access/htup_details.h" #include "catalog/namespace.h" #include "catalog/pg_type.h" #include "commands/trigger.h" @@ -65,6 +66,7 @@ typedef struct #define STATHDRSIZE (offsetof(TSVectorStat, data)) static Datum tsvector_update_trigger(PG_FUNCTION_ARGS, bool config_column); +static int tsvector_bsearch(TSVector tsin, char *lexin, int lexin_len); /* * Order: haspos, len, word, for all positions (pos, weight) @@ -251,6 +253,90 @@ tsvector_setweight(PG_FUNCTION_ARGS) PG_RETURN_POINTER(out); } +/* + * setweight(tsin tsvector, char_weight "char", lexemes "text"[]) + * + * Assign weight w to elements of tsin that are listed in lexemes. + */ +Datum +tsvector_setweight_by_filter(PG_FUNCTION_ARGS) +{ + TSVector tsin = PG_GETARG_TSVECTOR(0); + char char_weight = PG_GETARG_CHAR(1); + ArrayType *lexemes = PG_GETARG_ARRAYTYPE_P(2); + + TSVector tsout; + int i, + j, + nlexemes, + weight; + WordEntry *entry; + Datum *dlexemes; + bool *nulls; + + switch (char_weight) + { + case 'A': case 'a': + weight = 3; + break; + case 'B': case 'b': + weight = 2; + break; + case 'C': case 'c': + weight = 1; + break; + case 'D': case 'd': + weight = 0; + break; + default: + /* internal error */ + elog(ERROR, "unrecognized weight: %c", char_weight); + } + + tsout = (TSVector) palloc(VARSIZE(tsin)); + memcpy(tsout, tsin, VARSIZE(tsin)); + entry = ARRPTR(tsout); + + deconstruct_array(lexemes, TEXTOID, -1, false, 'i', + &dlexemes, &nulls, &nlexemes); + + /* + * Assuming that lexemes array is significantly shorter than tsvector + * we can iterate through lexemes performing binary search + * of each lexeme from lexemes in tsvector. + */ + for (i = 0; i < nlexemes; i++) + { + char *lex; + int lex_len, + lex_pos; + + if (nulls[i]) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("lexeme array may not contain nulls"))); + + lex = VARDATA(dlexemes[i]); + lex_len = VARSIZE_ANY_EXHDR(dlexemes[i]); + lex_pos = tsvector_bsearch(tsout, lex, lex_len); + + if (lex_pos >= 0 && (j = POSDATALEN(tsout, entry + lex_pos)) != 0) + { + WordEntryPos *p = POSDATAPTR(tsout, entry + lex_pos); + while (j--) + { + WEP_SETWEIGHT(*p, weight); + p++; + } + } + } + + PG_FREE_IF_COPY(tsin, 0); + PG_FREE_IF_COPY(lexemes, 2); + + PG_RETURN_POINTER(tsout); +} + #define compareEntry(pa, a, pb, b) \ tsCompareString((pa) + (a)->pos, (a)->len, \ (pb) + (b)->pos, (b)->len, \ @@ -291,6 +377,483 @@ add_pos(TSVector src, WordEntry *srcptr, return *clen - startlen; } +/* + * Perform binary search of given lexeme in TSVector. + * Returns lexeme position in TSVector's entry array or -1 if lexeme wasn't + * found. + */ +static int +tsvector_bsearch(const TSVector tsv, char *lexeme, int lexeme_len) +{ + WordEntry *arrin = ARRPTR(tsv); + int StopLow = 0, + StopHigh = tsv->size, + StopMiddle, + cmp; + + while (StopLow < StopHigh) + { + StopMiddle = (StopLow + StopHigh)/2; + + cmp = tsCompareString(lexeme, lexeme_len, + STRPTR(tsv) + arrin[StopMiddle].pos, + arrin[StopMiddle].len, + false); + + if (cmp < 0) + StopHigh = StopMiddle; + else if (cmp > 0) + StopLow = StopMiddle + 1; + else /* found it */ + return StopMiddle; + } + + return -1; +} + +static int +compareint(const void *va, const void *vb) +{ + int32 a = *((const int32 *) va); + int32 b = *((const int32 *) vb); + + if (a == b) + return 0; + return (a > b) ? 1 : -1; +} + +/* + * Internal routine to delete lexemes from TSVector by array of offsets. + * + * int *indices_to_delete -- array of lexeme offsets to delete + * int indices_count -- size of that array + * + * Returns new TSVector without given lexemes along with their positions + * and weights. + */ +static TSVector +tsvector_delete_by_indices(TSVector tsv, int *indices_to_delete, + int indices_count) +{ + TSVector tsout; + WordEntry *arrin = ARRPTR(tsv), + *arrout; + char *data = STRPTR(tsv), + *dataout; + int i, j, k, + curoff; + + /* + * Here we overestimates tsout size, since we don't know exact size + * occupied by positions and weights. We will set exact size later + * after a pass through TSVector. + */ + tsout = (TSVector) palloc0(VARSIZE(tsv)); + arrout = ARRPTR(tsout); + tsout->size = tsv->size - indices_count; + + /* Sort our filter array to simplify membership check later. */ + if (indices_count > 1) + qsort(indices_to_delete, indices_count, sizeof(int), compareint); + + /* + * Copy tsv to tsout skipping lexemes that enlisted in indices_to_delete. + */ + curoff = 0; + dataout = STRPTR(tsout); + for (i = j = k = 0; i < tsv->size; i++) + { + /* + * Here we should check whether current i is present in + * indices_to_delete or not. Since indices_to_delete is already + * sorted we can advance it index only when we have match. + */ + if (k < indices_count && i == indices_to_delete[k]){ + k++; + continue; + } + + /* Copy lexeme, it's positions and weights */ + memcpy(dataout + curoff, data + arrin[i].pos, arrin[i].len); + arrout[j].haspos = arrin[i].haspos; + arrout[j].len = arrin[i].len; + arrout[j].pos = curoff; + curoff += arrin[i].len; + if (arrin[i].haspos) + { + int len = POSDATALEN(tsv, arrin+i) * sizeof(WordEntryPos) + + sizeof(uint16); + curoff = SHORTALIGN(curoff); + memcpy(dataout + curoff, + STRPTR(tsv) + SHORTALIGN(arrin[i].pos + arrin[i].len), + len); + curoff += len; + } + + j++; + } + + /* + * After the pass through TSVector k should equals exactly to indices_count. + * If it isn't then the caller provided us with indices outside of + * [0, tsv->size) range and estimation of tsout's size is wrong. + */ + Assert(k == indices_count); + + SET_VARSIZE(tsout, CALCDATASIZE(tsout->size, curoff)); + return tsout; +} + +/* + * Delete given lexeme from tsvector. + * Implementation of user-level delete(tsvector, text). + */ +Datum +tsvector_delete_str(PG_FUNCTION_ARGS) +{ + TSVector tsin = PG_GETARG_TSVECTOR(0), + tsout; + text *tlexeme = PG_GETARG_TEXT_P(1); + char *lexeme = VARDATA(tlexeme); + int lexeme_len = VARSIZE_ANY_EXHDR(tlexeme), + skip_index; + + if ((skip_index = tsvector_bsearch(tsin, lexeme, lexeme_len)) == -1) + PG_RETURN_POINTER(tsin); + + tsout = tsvector_delete_by_indices(tsin, &skip_index, 1); + + PG_FREE_IF_COPY(tsin, 0); + PG_FREE_IF_COPY(tlexeme, 1); + PG_RETURN_POINTER(tsout); +} + +/* + * Delete given array of lexemes from tsvector. + * Implementation of user-level delete(tsvector, text[]). + */ +Datum +tsvector_delete_arr(PG_FUNCTION_ARGS) +{ + TSVector tsin = PG_GETARG_TSVECTOR(0), + tsout; + ArrayType *lexemes = PG_GETARG_ARRAYTYPE_P(1); + int i, nlex, + skip_count, + *skip_indices; + Datum *dlexemes; + bool *nulls; + + deconstruct_array(lexemes, TEXTOID, -1, false, 'i', + &dlexemes, &nulls, &nlex); + + /* + * In typical use case array of lexemes to delete is relatively small. + * So here we optimizing things for that scenario: iterate through lexarr + * performing binary search of each lexeme from lexarr in tsvector. + */ + skip_indices = palloc0(nlex * sizeof(int)); + for (i = skip_count = 0; i < nlex; i++) + { + char *lex; + int lex_len, + lex_pos; + + if (nulls[i]) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("lexeme array may not contain nulls"))); + + lex = VARDATA(dlexemes[i]); + lex_len = VARSIZE_ANY_EXHDR(dlexemes[i]); + lex_pos = tsvector_bsearch(tsin, lex, lex_len); + + if (lex_pos >= 0) + skip_indices[skip_count++] = lex_pos; + } + + tsout = tsvector_delete_by_indices(tsin, skip_indices, skip_count); + + pfree(skip_indices); + PG_FREE_IF_COPY(tsin, 0); + PG_FREE_IF_COPY(lexemes, 1); + + PG_RETURN_POINTER(tsout); +} + +/* + * Expand tsvector as table with following columns: + * lexeme: lexeme text + * positions: integer array of lexeme positions + * weights: char array of weights corresponding to positions + */ +Datum +tsvector_unnest(PG_FUNCTION_ARGS) +{ + FuncCallContext *funcctx; + TSVector tsin; + + if (SRF_IS_FIRSTCALL()) + { + MemoryContext oldcontext; + TupleDesc tupdesc; + + funcctx = SRF_FIRSTCALL_INIT(); + oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + + tupdesc = CreateTemplateTupleDesc(3, false); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "lexeme", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "positions", + INT2ARRAYOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 3, "weights", + TEXTARRAYOID, -1, 0); + funcctx->tuple_desc = BlessTupleDesc(tupdesc); + + funcctx->user_fctx = PG_GETARG_TSVECTOR_COPY(0); + + MemoryContextSwitchTo(oldcontext); + } + + funcctx = SRF_PERCALL_SETUP(); + tsin = (TSVector) funcctx->user_fctx; + + if (funcctx->call_cntr < tsin->size) + { + WordEntry *arrin = ARRPTR(tsin); + char *data = STRPTR(tsin); + HeapTuple tuple; + int j, + i = funcctx->call_cntr; + bool nulls[] = {false, false, false}; + Datum values[3]; + + values[0] = PointerGetDatum( + cstring_to_text_with_len(data + arrin[i].pos, arrin[i].len) + ); + + if (arrin[i].haspos) + { + WordEntryPosVector *posv; + Datum *positions; + Datum *weights; + char weight; + + /* + * Internally tsvector stores position and weight in the same + * uint16 (2 bits for weight, 14 for position). Here we extract that + * in two separate arrays. + */ + posv = _POSVECPTR(tsin, arrin + i); + positions = palloc(posv->npos * sizeof(Datum)); + weights = palloc(posv->npos * sizeof(Datum)); + for (j = 0; j < posv->npos; j++) + { + positions[j] = Int16GetDatum(WEP_GETPOS(posv->pos[j])); + weight = 'D' - WEP_GETWEIGHT(posv->pos[j]); + weights[j] = PointerGetDatum( + cstring_to_text_with_len(&weight, 1) + ); + } + + values[1] = PointerGetDatum( + construct_array(positions, posv->npos, INT2OID, 2, true, 's')); + values[2] = PointerGetDatum( + construct_array(weights, posv->npos, TEXTOID, -1, false, 'i')); + } + else + { + nulls[1] = nulls[2] = true; + } + + tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls); + SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple)); + } + else + { + pfree(tsin); + SRF_RETURN_DONE(funcctx); + } +} + +/* + * Convert tsvector to array of lexemes. + */ +Datum +tsvector_to_array(PG_FUNCTION_ARGS) +{ + TSVector tsin = PG_GETARG_TSVECTOR(0); + WordEntry *arrin = ARRPTR(tsin); + Datum elements[tsin->size]; + int i; + ArrayType *array; + + for (i = 0; i < tsin->size; i++) + { + elements[i] = PointerGetDatum( + cstring_to_text_with_len(STRPTR(tsin) + arrin[i].pos, arrin[i].len) + ); + } + + array = construct_array(elements, tsin->size, TEXTOID, -1, false, 'i'); + PG_FREE_IF_COPY(tsin, 0); + PG_RETURN_POINTER(array); +} + +/* + * Build tsvector from array of lexemes. + */ +Datum +array_to_tsvector(PG_FUNCTION_ARGS) +{ + ArrayType *v = PG_GETARG_ARRAYTYPE_P(0); + TSVector tsout; + Datum *dlexemes; + WordEntry *arrout; + bool *nulls; + int nitems, + i, + tslen, + datalen = 0; + char *cur; + + deconstruct_array(v, TEXTOID, -1, false, 'i', &dlexemes, &nulls, &nitems); + + for (i = 0; i < nitems; i++) + { + if (nulls[i]) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("lexeme array may not contain nulls"))); + + datalen += VARSIZE_ANY_EXHDR(dlexemes[i]); + } + + tslen = CALCDATASIZE(nitems, datalen); + tsout = (TSVector) palloc0(tslen); + SET_VARSIZE(tsout, tslen); + tsout->size = nitems; + arrout = ARRPTR(tsout); + cur = STRPTR(tsout); + + for (i = 0; i < nitems; i++) + { + char *lex = VARDATA(dlexemes[i]); + int lex_len = VARSIZE_ANY_EXHDR(dlexemes[i]); + + memcpy(cur, lex, lex_len); + arrout[i].haspos = 0; + arrout[i].len = lex_len; + arrout[i].pos = cur - STRPTR(tsout); + cur += lex_len; + } + + PG_FREE_IF_COPY(v, 0); + PG_RETURN_POINTER(tsout); +} + +/* + * Leave only elements with given weights from tsvector. + */ +Datum +tsvector_filter(PG_FUNCTION_ARGS) +{ + TSVector tsin = PG_GETARG_TSVECTOR(0), + tsout; + ArrayType *weights = PG_GETARG_ARRAYTYPE_P(1); + WordEntry *arrin = ARRPTR(tsin), + *arrout; + char *datain = STRPTR(tsin), + *dataout; + Datum *dweights; + bool *nulls; + int nweigths; + int i, j; + char mask = 0, + cur_pos = 0; + + deconstruct_array(weights, CHAROID, 1, true, 'c', + &dweights, &nulls, &nweigths); + + for (i = 0; i < nweigths; i++) + { + char char_weight; + + if (nulls[i]) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("weight array may not contain nulls"))); + + char_weight = DatumGetChar(dweights[i]); + switch (char_weight) + { + case 'A': case 'a': + mask = mask | 8; + break; + case 'B': case 'b': + mask = mask | 4; + break; + case 'C': case 'c': + mask = mask | 2; + break; + case 'D': case 'd': + mask = mask | 1; + break; + default: + /* internal error */ + elog(ERROR, "unrecognized weight: %c", char_weight); + } + } + + tsout = (TSVector) palloc0(VARSIZE(tsin)); + tsout->size = tsin->size; + arrout = ARRPTR(tsout); + dataout = STRPTR(tsout); + + for (i = j = 0; i < tsin->size; i++) + { + WordEntryPosVector *posvin, + *posvout; + int npos = 0; + int k; + + if (!arrin[i].haspos) + continue; + + posvin = _POSVECPTR(tsin, arrin + i); + posvout = (WordEntryPosVector *) + (dataout + SHORTALIGN(cur_pos + arrin[i].len)); + + for (k = 0; k < posvin->npos; k++) + { + if (mask & (1 << WEP_GETWEIGHT(posvin->pos[k]))) + posvout->pos[npos++] = posvin->pos[k]; + } + + if (!npos) /* no satisfactory positions found, so skip that lexeme */ + continue; + + arrout[j].haspos = true; + arrout[j].len = arrin[i].len; + arrout[j].pos = cur_pos; + + memcpy(dataout + cur_pos, datain + arrin[i].pos, arrin[i].len); + posvout->npos = npos; + cur_pos += SHORTALIGN(arrin[i].len); + cur_pos += POSDATALEN(tsout, arrout+j) * sizeof(WordEntryPos) + + sizeof(uint16); + j++; + } + + tsout->size = j; + if (dataout != STRPTR(tsout)) + memmove(STRPTR(tsout), dataout, cur_pos); + + SET_VARSIZE(tsout, CALCDATASIZE(tsout->size, cur_pos)); + + PG_FREE_IF_COPY(tsin, 0); + PG_RETURN_POINTER(tsout); +} Datum tsvector_concat(PG_FUNCTION_ARGS) |