diff options
Diffstat (limited to 'src/backend/utils/adt/tsrank.c')
-rw-r--r-- | src/backend/utils/adt/tsrank.c | 804 |
1 files changed, 804 insertions, 0 deletions
diff --git a/src/backend/utils/adt/tsrank.c b/src/backend/utils/adt/tsrank.c new file mode 100644 index 00000000000..8b2ab884c8c --- /dev/null +++ b/src/backend/utils/adt/tsrank.c @@ -0,0 +1,804 @@ +/*------------------------------------------------------------------------- + * + * tsrank.c + * rank tsvector by tsquery + * + * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * $PostgreSQL: pgsql/src/backend/utils/adt/tsrank.c,v 1.1 2007/08/21 01:11:19 tgl Exp $ + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <math.h> + +#include "tsearch/ts_type.h" +#include "tsearch/ts_utils.h" +#include "utils/array.h" + + +static float weights[] = {0.1, 0.2, 0.4, 1.0}; + +#define wpos(wep) ( w[ WEP_GETWEIGHT(wep) ] ) + +#define RANK_NO_NORM 0x00 +#define RANK_NORM_LOGLENGTH 0x01 +#define RANK_NORM_LENGTH 0x02 +#define RANK_NORM_EXTDIST 0x04 +#define RANK_NORM_UNIQ 0x08 +#define RANK_NORM_LOGUNIQ 0x10 +#define DEF_NORM_METHOD RANK_NO_NORM + +static float calc_rank_or(float *w, TSVector t, TSQuery q); +static float calc_rank_and(float *w, TSVector t, TSQuery q); + +/* + * Returns a weight of a word collocation + */ +static float4 +word_distance(int4 w) +{ + if (w > 100) + return 1e-30; + + return 1.0 / (1.005 + 0.05 * exp(((float4) w) / 1.5 - 2)); +} + +static int +cnt_length(TSVector t) +{ + WordEntry *ptr = ARRPTR(t), + *end = (WordEntry *) STRPTR(t); + int len = 0, + clen; + + while (ptr < end) + { + if ((clen = POSDATALEN(t, ptr)) == 0) + len += 1; + else + len += clen; + ptr++; + } + + return len; +} + +static int4 +WordECompareQueryItem(char *eval, char *qval, WordEntry * ptr, QueryItem * item) +{ + if (ptr->len == item->length) + return strncmp( + eval + ptr->pos, + qval + item->distance, + item->length); + + return (ptr->len > item->length) ? 1 : -1; +} + +static WordEntry * +find_wordentry(TSVector t, TSQuery q, QueryItem * item) +{ + WordEntry *StopLow = ARRPTR(t); + WordEntry *StopHigh = (WordEntry *) STRPTR(t); + WordEntry *StopMiddle; + int difference; + + /* Loop invariant: StopLow <= item < StopHigh */ + + while (StopLow < StopHigh) + { + StopMiddle = StopLow + (StopHigh - StopLow) / 2; + difference = WordECompareQueryItem(STRPTR(t), GETOPERAND(q), StopMiddle, item); + if (difference == 0) + return StopMiddle; + else if (difference < 0) + StopLow = StopMiddle + 1; + else + StopHigh = StopMiddle; + } + + return NULL; +} + + +static int +compareQueryItem(const void *a, const void *b, void *arg) +{ + char *operand = (char *) arg; + + if ((*(QueryItem **) a)->length == (*(QueryItem **) b)->length) + return strncmp(operand + (*(QueryItem **) a)->distance, + operand + (*(QueryItem **) b)->distance, + (*(QueryItem **) b)->length); + + return ((*(QueryItem **) a)->length > (*(QueryItem **) b)->length) ? 1 : -1; +} + +static QueryItem ** +SortAndUniqItems(char *operand, QueryItem * item, int *size) +{ + QueryItem **res, + **ptr, + **prevptr; + + ptr = res = (QueryItem **) palloc(sizeof(QueryItem *) * *size); + + while ((*size)--) + { + if (item->type == VAL) + { + *ptr = item; + ptr++; + } + item++; + } + + *size = ptr - res; + if (*size < 2) + return res; + + qsort_arg(res, *size, sizeof(QueryItem **), compareQueryItem, (void *) operand); + + ptr = res + 1; + prevptr = res; + + while (ptr - res < *size) + { + if (compareQueryItem((void *) ptr, (void *) prevptr, (void *) operand) != 0) + { + prevptr++; + *prevptr = *ptr; + } + ptr++; + } + + *size = prevptr + 1 - res; + return res; +} + +static WordEntryPos POSNULL[] = { + 0, + 0 +}; + +static float +calc_rank_and(float *w, TSVector t, TSQuery q) +{ + uint16 **pos; + int i, + k, + l, + p; + WordEntry *entry; + WordEntryPos *post, + *ct; + int4 dimt, + lenct, + dist; + float res = -1.0; + QueryItem **item; + int size = q->size; + + item = SortAndUniqItems(GETOPERAND(q), GETQUERY(q), &size); + if (size < 2) + { + pfree(item); + return calc_rank_or(w, t, q); + } + pos = (uint16 **) palloc(sizeof(uint16 *) * q->size); + memset(pos, 0, sizeof(uint16 *) * q->size); + *(uint16 *) POSNULL = lengthof(POSNULL) - 1; + WEP_SETPOS(POSNULL[1], MAXENTRYPOS - 1); + + for (i = 0; i < size; i++) + { + entry = find_wordentry(t, q, item[i]); + if (!entry) + continue; + + if (entry->haspos) + pos[i] = (uint16 *) _POSDATAPTR(t, entry); + else + pos[i] = (uint16 *) POSNULL; + + + dimt = *(uint16 *) (pos[i]); + post = (WordEntryPos *) (pos[i] + 1); + for (k = 0; k < i; k++) + { + if (!pos[k]) + continue; + lenct = *(uint16 *) (pos[k]); + ct = (WordEntryPos *) (pos[k] + 1); + for (l = 0; l < dimt; l++) + { + for (p = 0; p < lenct; p++) + { + dist = Abs((int) WEP_GETPOS(post[l]) - (int) WEP_GETPOS(ct[p])); + if (dist || (dist == 0 && (pos[i] == (uint16 *) POSNULL || pos[k] == (uint16 *) POSNULL))) + { + float curw; + + if (!dist) + dist = MAXENTRYPOS; + curw = sqrt(wpos(post[l]) * wpos(ct[p]) * word_distance(dist)); + res = (res < 0) ? curw : 1.0 - (1.0 - res) * (1.0 - curw); + } + } + } + } + } + pfree(pos); + pfree(item); + return res; +} + +static float +calc_rank_or(float *w, TSVector t, TSQuery q) +{ + WordEntry *entry; + WordEntryPos *post; + int4 dimt, + j, + i; + float res = 0.0; + QueryItem **item; + int size = q->size; + + *(uint16 *) POSNULL = lengthof(POSNULL) - 1; + item = SortAndUniqItems(GETOPERAND(q), GETQUERY(q), &size); + + for (i = 0; i < size; i++) + { + float resj, + wjm; + int4 jm; + + entry = find_wordentry(t, q, item[i]); + if (!entry) + continue; + + if (entry->haspos) + { + dimt = POSDATALEN(t, entry); + post = POSDATAPTR(t, entry); + } + else + { + dimt = *(uint16 *) POSNULL; + post = POSNULL + 1; + } + + resj = 0.0; + wjm = -1.0; + jm = 0; + for (j = 0; j < dimt; j++) + { + resj = resj + wpos(post[j]) / ((j + 1) * (j + 1)); + if (wpos(post[j]) > wjm) + { + wjm = wpos(post[j]); + jm = j; + } + } +/* + limit (sum(i/i^2),i->inf) = pi^2/6 + resj = sum(wi/i^2),i=1,noccurence, + wi - should be sorted desc, + don't sort for now, just choose maximum weight. This should be corrected + Oleg Bartunov +*/ + res = res + (wjm + resj - wjm / ((jm + 1) * (jm + 1))) / 1.64493406685; + } + if (size > 0) + res = res / size; + pfree(item); + return res; +} + +static float +calc_rank(float *w, TSVector t, TSQuery q, int4 method) +{ + QueryItem *item = GETQUERY(q); + float res = 0.0; + int len; + + if (!t->size || !q->size) + return 0.0; + + res = (item->type != VAL && item->val == (int4) '&') ? + calc_rank_and(w, t, q) : calc_rank_or(w, t, q); + + if (res < 0) + res = 1e-20; + + if ((method & RANK_NORM_LOGLENGTH) && t->size > 0) + res /= log((double) (cnt_length(t) + 1)) / log(2.0); + + if (method & RANK_NORM_LENGTH) + { + len = cnt_length(t); + if (len > 0) + res /= (float) len; + } + + if ((method & RANK_NORM_UNIQ) && t->size > 0) + res /= (float) (t->size); + + if ((method & RANK_NORM_LOGUNIQ) && t->size > 0) + res /= log((double) (t->size + 1)) / log(2.0); + + return res; +} + +static float * +getWeights(ArrayType *win) +{ + static float ws[lengthof(weights)]; + int i; + float4 *arrdata; + + if (win == 0) + return weights; + + if (ARR_NDIM(win) != 1) + ereport(ERROR, + (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), + errmsg("array of weight must be one-dimensional"))); + + if (ArrayGetNItems(ARR_NDIM(win), ARR_DIMS(win)) < lengthof(weights)) + ereport(ERROR, + (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), + errmsg("array of weight is too short"))); + + if (ARR_HASNULL(win)) + ereport(ERROR, + (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), + errmsg("array of weight must not contain nulls"))); + + arrdata = (float4 *) ARR_DATA_PTR(win); + for (i = 0; i < lengthof(weights); i++) + { + ws[i] = (arrdata[i] >= 0) ? arrdata[i] : weights[i]; + if (ws[i] > 1.0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("weight out of range"))); + } + + return ws; +} + +Datum +ts_rank_wttf(PG_FUNCTION_ARGS) +{ + ArrayType *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0)); + TSVector txt = PG_GETARG_TSVECTOR(1); + TSQuery query = PG_GETARG_TSQUERY(2); + int method = PG_GETARG_INT32(3); + float res; + + res = calc_rank(getWeights(win), txt, query, method); + + PG_FREE_IF_COPY(win, 0); + PG_FREE_IF_COPY(txt, 1); + PG_FREE_IF_COPY(query, 2); + PG_RETURN_FLOAT4(res); +} + +Datum +ts_rank_wtt(PG_FUNCTION_ARGS) +{ + ArrayType *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0)); + TSVector txt = PG_GETARG_TSVECTOR(1); + TSQuery query = PG_GETARG_TSQUERY(2); + float res; + + res = calc_rank(getWeights(win), txt, query, DEF_NORM_METHOD); + + PG_FREE_IF_COPY(win, 0); + PG_FREE_IF_COPY(txt, 1); + PG_FREE_IF_COPY(query, 2); + PG_RETURN_FLOAT4(res); +} + +Datum +ts_rank_ttf(PG_FUNCTION_ARGS) +{ + TSVector txt = PG_GETARG_TSVECTOR(0); + TSQuery query = PG_GETARG_TSQUERY(1); + int method = PG_GETARG_INT32(2); + float res; + + res = calc_rank(getWeights(NULL), txt, query, method); + + PG_FREE_IF_COPY(txt, 0); + PG_FREE_IF_COPY(query, 1); + PG_RETURN_FLOAT4(res); +} + +Datum +ts_rank_tt(PG_FUNCTION_ARGS) +{ + TSVector txt = PG_GETARG_TSVECTOR(0); + TSQuery query = PG_GETARG_TSQUERY(1); + float res; + + res = calc_rank(getWeights(NULL), txt, query, DEF_NORM_METHOD); + + PG_FREE_IF_COPY(txt, 0); + PG_FREE_IF_COPY(query, 1); + PG_RETURN_FLOAT4(res); +} + +typedef struct +{ + QueryItem **item; + int16 nitem; + bool needfree; + uint8 wclass; + int32 pos; +} DocRepresentation; + +static int +compareDocR(const void *a, const void *b) +{ + if (((DocRepresentation *) a)->pos == ((DocRepresentation *) b)->pos) + return 0; + return (((DocRepresentation *) a)->pos > ((DocRepresentation *) b)->pos) ? 1 : -1; +} + +static bool +checkcondition_QueryItem(void *checkval, QueryItem * val) +{ + return (bool) (val->istrue); +} + +static void +reset_istrue_flag(TSQuery query) +{ + QueryItem *item = GETQUERY(query); + int i; + + /* reset istrue flag */ + for (i = 0; i < query->size; i++) + { + if (item->type == VAL) + item->istrue = 0; + item++; + } +} + +typedef struct +{ + int pos; + int p; + int q; + DocRepresentation *begin; + DocRepresentation *end; +} Extention; + + +static bool +Cover(DocRepresentation * doc, int len, TSQuery query, Extention * ext) +{ + DocRepresentation *ptr; + int lastpos = ext->pos; + int i; + bool found = false; + + reset_istrue_flag(query); + + ext->p = 0x7fffffff; + ext->q = 0; + ptr = doc + ext->pos; + + /* find upper bound of cover from current position, move up */ + while (ptr - doc < len) + { + for (i = 0; i < ptr->nitem; i++) + ptr->item[i]->istrue = 1; + if (TS_execute(GETQUERY(query), NULL, false, checkcondition_QueryItem)) + { + if (ptr->pos > ext->q) + { + ext->q = ptr->pos; + ext->end = ptr; + lastpos = ptr - doc; + found = true; + } + break; + } + ptr++; + } + + if (!found) + return false; + + reset_istrue_flag(query); + + ptr = doc + lastpos; + + /* find lower bound of cover from founded upper bound, move down */ + while (ptr >= doc + ext->pos) + { + for (i = 0; i < ptr->nitem; i++) + ptr->item[i]->istrue = 1; + if (TS_execute(GETQUERY(query), NULL, true, checkcondition_QueryItem)) + { + if (ptr->pos < ext->p) + { + ext->begin = ptr; + ext->p = ptr->pos; + } + break; + } + ptr--; + } + + if (ext->p <= ext->q) + { + /* + * set position for next try to next lexeme after begining of founded + * cover + */ + ext->pos = (ptr - doc) + 1; + return true; + } + + ext->pos++; + return Cover(doc, len, query, ext); +} + +static DocRepresentation * +get_docrep(TSVector txt, TSQuery query, int *doclen) +{ + QueryItem *item = GETQUERY(query); + WordEntry *entry; + WordEntryPos *post; + int4 dimt, + j, + i; + int len = query->size * 4, + cur = 0; + DocRepresentation *doc; + char *operand; + + *(uint16 *) POSNULL = lengthof(POSNULL) - 1; + doc = (DocRepresentation *) palloc(sizeof(DocRepresentation) * len); + operand = GETOPERAND(query); + reset_istrue_flag(query); + + for (i = 0; i < query->size; i++) + { + if (item[i].type != VAL || item[i].istrue) + continue; + + entry = find_wordentry(txt, query, &(item[i])); + if (!entry) + continue; + + if (entry->haspos) + { + dimt = POSDATALEN(txt, entry); + post = POSDATAPTR(txt, entry); + } + else + { + dimt = *(uint16 *) POSNULL; + post = POSNULL + 1; + } + + while (cur + dimt >= len) + { + len *= 2; + doc = (DocRepresentation *) repalloc(doc, sizeof(DocRepresentation) * len); + } + + for (j = 0; j < dimt; j++) + { + if (j == 0) + { + QueryItem *kptr, + *iptr = item + i; + int k; + + doc[cur].needfree = false; + doc[cur].nitem = 0; + doc[cur].item = (QueryItem **) palloc(sizeof(QueryItem *) * query->size); + + for (k = 0; k < query->size; k++) + { + kptr = item + k; + if (k == i || + (item[k].type == VAL && + compareQueryItem(&kptr, &iptr, operand) == 0)) + { + doc[cur].item[doc[cur].nitem] = item + k; + doc[cur].nitem++; + kptr->istrue = 1; + } + } + } + else + { + doc[cur].needfree = false; + doc[cur].nitem = doc[cur - 1].nitem; + doc[cur].item = doc[cur - 1].item; + } + doc[cur].pos = WEP_GETPOS(post[j]); + doc[cur].wclass = WEP_GETWEIGHT(post[j]); + cur++; + } + } + + *doclen = cur; + + if (cur > 0) + { + if (cur > 1) + qsort((void *) doc, cur, sizeof(DocRepresentation), compareDocR); + return doc; + } + + pfree(doc); + return NULL; +} + +static float4 +calc_rank_cd(float4 *arrdata, TSVector txt, TSQuery query, int method) +{ + DocRepresentation *doc; + int len, + i, + doclen = 0; + Extention ext; + double Wdoc = 0.0; + double invws[lengthof(weights)]; + double SumDist = 0.0, + PrevExtPos = 0.0, + CurExtPos = 0.0; + int NExtent = 0; + + for (i = 0; i < lengthof(weights); i++) + { + invws[i] = ((double) ((arrdata[i] >= 0) ? arrdata[i] : weights[i])); + if (invws[i] > 1.0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("weight out of range"))); + invws[i] = 1.0 / invws[i]; + } + + doc = get_docrep(txt, query, &doclen); + if (!doc) + return 0.0; + + MemSet(&ext, 0, sizeof(Extention)); + while (Cover(doc, doclen, query, &ext)) + { + double Cpos = 0.0; + double InvSum = 0.0; + int nNoise; + DocRepresentation *ptr = ext.begin; + + while (ptr <= ext.end) + { + InvSum += invws[ptr->wclass]; + ptr++; + } + + Cpos = ((double) (ext.end - ext.begin + 1)) / InvSum; + + /* + * if doc are big enough then ext.q may be equal to ext.p due to limit + * of posional information. In this case we approximate number of + * noise word as half cover's length + */ + nNoise = (ext.q - ext.p) - (ext.end - ext.begin); + if (nNoise < 0) + nNoise = (ext.end - ext.begin) / 2; + Wdoc += Cpos / ((double) (1 + nNoise)); + + CurExtPos = ((double) (ext.q + ext.p)) / 2.0; + if (NExtent > 0 && CurExtPos > PrevExtPos /* prevent devision by + * zero in a case of + multiple lexize */ ) + SumDist += 1.0 / (CurExtPos - PrevExtPos); + + PrevExtPos = CurExtPos; + NExtent++; + } + + if ((method & RANK_NORM_LOGLENGTH) && txt->size > 0) + Wdoc /= log((double) (cnt_length(txt) + 1)); + + if (method & RANK_NORM_LENGTH) + { + len = cnt_length(txt); + if (len > 0) + Wdoc /= (double) len; + } + + if ((method & RANK_NORM_EXTDIST) && SumDist > 0) + Wdoc /= ((double) NExtent) / SumDist; + + if ((method & RANK_NORM_UNIQ) && txt->size > 0) + Wdoc /= (double) (txt->size); + + if ((method & RANK_NORM_LOGUNIQ) && txt->size > 0) + Wdoc /= log((double) (txt->size + 1)) / log(2.0); + + for (i = 0; i < doclen; i++) + if (doc[i].needfree) + pfree(doc[i].item); + pfree(doc); + + return (float4) Wdoc; +} + +Datum +ts_rankcd_wttf(PG_FUNCTION_ARGS) +{ + ArrayType *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0)); + TSVector txt = PG_GETARG_TSVECTOR(1); + TSQuery query = PG_GETARG_TSQUERY_COPY(2); + int method = PG_GETARG_INT32(3); + float res; + + res = calc_rank_cd(getWeights(win), txt, query, method); + + PG_FREE_IF_COPY(win, 0); + PG_FREE_IF_COPY(txt, 1); + PG_FREE_IF_COPY(query, 2); + PG_RETURN_FLOAT4(res); +} + +Datum +ts_rankcd_wtt(PG_FUNCTION_ARGS) +{ + ArrayType *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0)); + TSVector txt = PG_GETARG_TSVECTOR(1); + TSQuery query = PG_GETARG_TSQUERY_COPY(2); + float res; + + res = calc_rank_cd(getWeights(win), txt, query, DEF_NORM_METHOD); + + PG_FREE_IF_COPY(win, 0); + PG_FREE_IF_COPY(txt, 1); + PG_FREE_IF_COPY(query, 2); + PG_RETURN_FLOAT4(res); +} + +Datum +ts_rankcd_ttf(PG_FUNCTION_ARGS) +{ + TSVector txt = PG_GETARG_TSVECTOR(0); + TSQuery query = PG_GETARG_TSQUERY_COPY(1); + int method = PG_GETARG_INT32(2); + float res; + + res = calc_rank_cd(getWeights(NULL), txt, query, method); + + PG_FREE_IF_COPY(txt, 0); + PG_FREE_IF_COPY(query, 1); + PG_RETURN_FLOAT4(res); +} + +Datum +ts_rankcd_tt(PG_FUNCTION_ARGS) +{ + TSVector txt = PG_GETARG_TSVECTOR(0); + TSQuery query = PG_GETARG_TSQUERY_COPY(1); + float res; + + res = calc_rank_cd(getWeights(NULL), txt, query, DEF_NORM_METHOD); + + PG_FREE_IF_COPY(txt, 0); + PG_FREE_IF_COPY(query, 1); + PG_RETURN_FLOAT4(res); +} |