aboutsummaryrefslogtreecommitdiff
path: root/contrib/tsearch2/rank.c
diff options
context:
space:
mode:
Diffstat (limited to 'contrib/tsearch2/rank.c')
-rw-r--r--contrib/tsearch2/rank.c591
1 files changed, 591 insertions, 0 deletions
diff --git a/contrib/tsearch2/rank.c b/contrib/tsearch2/rank.c
new file mode 100644
index 00000000000..b73f400b88e
--- /dev/null
+++ b/contrib/tsearch2/rank.c
@@ -0,0 +1,591 @@
+/*
+ * Relevation
+ * Teodor Sigaev <teodor@sigaev.ru>
+ */
+#include "postgres.h"
+#include <math.h>
+
+#include "access/gist.h"
+#include "access/itup.h"
+#include "utils/elog.h"
+#include "utils/palloc.h"
+#include "utils/builtins.h"
+#include "fmgr.h"
+#include "funcapi.h"
+#include "storage/bufpage.h"
+#include "executor/spi.h"
+#include "commands/trigger.h"
+#include "nodes/pg_list.h"
+#include "catalog/namespace.h"
+
+#include "utils/array.h"
+
+#include "tsvector.h"
+#include "query.h"
+#include "common.h"
+
+PG_FUNCTION_INFO_V1(rank);
+Datum rank(PG_FUNCTION_ARGS);
+
+PG_FUNCTION_INFO_V1(rank_def);
+Datum rank_def(PG_FUNCTION_ARGS);
+
+PG_FUNCTION_INFO_V1(rank_cd);
+Datum rank_cd(PG_FUNCTION_ARGS);
+
+PG_FUNCTION_INFO_V1(rank_cd_def);
+Datum rank_cd_def(PG_FUNCTION_ARGS);
+
+PG_FUNCTION_INFO_V1(get_covers);
+Datum get_covers(PG_FUNCTION_ARGS);
+
+static float weights[]={0.1, 0.2, 0.4, 1.0};
+
+#define wpos(wep) ( w[ ((WordEntryPos*)(wep))->weight ] )
+
+#define DEF_NORM_METHOD 0
+
+/*
+ * Returns a weight of a word collocation
+ */
+static float4 word_distance ( int4 w ) {
+ if ( w>100 )
+ return 1e-30;
+
+ return 1.0/(1.005+0.05*exp( ((float4)w)/1.5-2) );
+}
+
+static int
+cnt_length( tsvector *t ) {
+ WordEntry *ptr=ARRPTR(t), *end=(WordEntry*)STRPTR(t);
+ int len = 0, clen;
+
+ while(ptr < end) {
+ if ( (clen=POSDATALEN(t, ptr)) == 0 )
+ len += 1;
+ else
+ len += clen;
+ ptr++;
+ }
+
+ return len;
+}
+
+static int4
+WordECompareITEM(char *eval, char *qval, WordEntry * ptr, ITEM * item) {
+ if (ptr->len == item->length)
+ return strncmp(
+ eval + ptr->pos,
+ qval + item->distance,
+ item->length);
+
+ return (ptr->len > item->length) ? 1 : -1;
+}
+
+static WordEntry*
+find_wordentry(tsvector *t, QUERYTYPE *q, ITEM *item) {
+ WordEntry *StopLow = ARRPTR(t);
+ WordEntry *StopHigh = (WordEntry*)STRPTR(t);
+ WordEntry *StopMiddle;
+ int difference;
+
+ /* Loop invariant: StopLow <= item < StopHigh */
+
+ while (StopLow < StopHigh)
+ {
+ StopMiddle = StopLow + (StopHigh - StopLow) / 2;
+ difference = WordECompareITEM(STRPTR(t), GETOPERAND(q), StopMiddle, item);
+ if (difference == 0)
+ return StopMiddle;
+ else if (difference < 0)
+ StopLow = StopMiddle + 1;
+ else
+ StopHigh = StopMiddle;
+ }
+
+ return NULL;
+}
+
+static WordEntryPos POSNULL[]={
+ {0,0},
+ {0,MAXENTRYPOS-1}
+};
+
+static float
+calc_rank_and(float *w, tsvector *t, QUERYTYPE *q) {
+ uint16 **pos=(uint16**)palloc(sizeof(uint16*) * q->size);
+ int i,k,l,p;
+ WordEntry *entry;
+ WordEntryPos *post,*ct;
+ int4 dimt,lenct,dist;
+ float res=-1.0;
+ ITEM *item=GETQUERY(q);
+
+ memset(pos,0,sizeof(uint16**) * q->size);
+ *(uint16*)POSNULL = lengthof(POSNULL)-1;
+
+ for(i=0; i<q->size; i++) {
+
+ if ( item[i].type != VAL )
+ continue;
+
+ entry=find_wordentry(t,q,&(item[i]));
+ if ( !entry )
+ continue;
+
+ if ( entry->haspos )
+ pos[i] = (uint16*)_POSDATAPTR(t,entry);
+ else
+ pos[i] = (uint16*)POSNULL;
+
+
+ dimt = *(uint16*)(pos[i]);
+ post = (WordEntryPos*)(pos[i]+1);
+ for( k=0; k<i; k++ ) {
+ if ( !pos[k] ) continue;
+ lenct = *(uint16*)(pos[k]);
+ ct = (WordEntryPos*)(pos[k]+1);
+ for(l=0; l<dimt; l++) {
+ for(p=0; p<lenct; p++) {
+ dist = abs( post[l].pos - ct[p].pos );
+ if ( dist || (dist==0 && (pos[i]==(uint16*)POSNULL || pos[k]==(uint16*)POSNULL) ) ) {
+ float curw;
+ if ( !dist ) dist=MAXENTRYPOS;
+ curw= sqrt( wpos(&(post[l])) * wpos( &(ct[p]) ) * word_distance(dist) );
+ res = ( res < 0 ) ? curw : 1.0 - ( 1.0 - res ) * ( 1.0 - curw );
+ }
+ }
+ }
+ }
+ }
+ pfree(pos);
+ return res;
+}
+
+static float
+calc_rank_or(float *w, tsvector *t, QUERYTYPE *q) {
+ WordEntry *entry;
+ WordEntryPos *post;
+ int4 dimt,j,i;
+ float res=-1.0;
+ ITEM *item=GETQUERY(q);
+
+ *(uint16*)POSNULL = lengthof(POSNULL)-1;
+
+ for(i=0; i<q->size; i++) {
+ if ( item[i].type != VAL )
+ continue;
+
+ entry=find_wordentry(t,q,&(item[i]));
+ if ( !entry )
+ continue;
+
+ if ( entry->haspos ) {
+ dimt = POSDATALEN(t,entry);
+ post = POSDATAPTR(t,entry);
+ } else {
+ dimt = *(uint16*)POSNULL;
+ post = POSNULL+1;
+ }
+
+ for(j=0;j<dimt;j++) {
+ if ( res < 0 )
+ res = wpos( &(post[j]) );
+ else
+ res = 1.0 - ( 1.0-res ) * ( 1.0-wpos( &(post[j]) ) );
+ }
+ }
+ return res;
+}
+
+static float
+calc_rank(float *w, tsvector *t, QUERYTYPE *q, int4 method) {
+ ITEM *item = GETQUERY(q);
+ float res=0.0;
+
+ if (!t->size || !q->size)
+ return 0.0;
+
+ res = ( item->type != VAL && item->val == (int4) '&' ) ?
+ calc_rank_and(w,t,q) : calc_rank_or(w,t,q);
+
+ if ( res < 0 )
+ res = 1e-20;
+
+ switch(method) {
+ case 0: break;
+ case 1: res /= log((float)cnt_length(t)); break;
+ case 2: res /= (float)cnt_length(t); break;
+ default:
+ elog(ERROR,"Unknown normalization method: %d",method);
+ }
+
+ return res;
+}
+
+Datum
+rank(PG_FUNCTION_ARGS) {
+ ArrayType *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
+ tsvector *txt = (tsvector *) PG_DETOAST_DATUM(PG_GETARG_DATUM(1));
+ QUERYTYPE *query = (QUERYTYPE *) PG_DETOAST_DATUM(PG_GETARG_DATUM(2));
+ int method=DEF_NORM_METHOD;
+ float res=0.0;
+ float ws[ lengthof(weights) ];
+ int i;
+
+ if ( ARR_NDIM(win) != 1 )
+ elog(ERROR,"Array of weight is not one dimentional");
+ if ( ARRNELEMS(win) < lengthof(weights) )
+ elog(ERROR,"Array of weight is too short");
+
+ for(i=0;i<lengthof(weights);i++) {
+ ws[ i ] = ( ((float4*)ARR_DATA_PTR(win))[i] >= 0 ) ? ((float4*)ARR_DATA_PTR(win))[i] : weights[i];
+ if ( ws[ i ] > 1.0 )
+ elog(ERROR,"Weight out of range");
+ }
+
+ if ( PG_NARGS() == 4 )
+ method=PG_GETARG_INT32(3);
+
+ res=calc_rank(ws, txt, query, method);
+
+ PG_FREE_IF_COPY(win, 0);
+ PG_FREE_IF_COPY(txt, 1);
+ PG_FREE_IF_COPY(query, 2);
+ PG_RETURN_FLOAT4(res);
+}
+
+Datum
+rank_def(PG_FUNCTION_ARGS) {
+ tsvector *txt = (tsvector *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
+ QUERYTYPE *query = (QUERYTYPE *) PG_DETOAST_DATUM(PG_GETARG_DATUM(1));
+ float res=0.0;
+ int method=DEF_NORM_METHOD;
+
+ if ( PG_NARGS() == 3 )
+ method=PG_GETARG_INT32(2);
+
+ res=calc_rank(weights, txt, query, method);
+
+ PG_FREE_IF_COPY(txt, 0);
+ PG_FREE_IF_COPY(query, 1);
+ PG_RETURN_FLOAT4(res);
+}
+
+
+typedef struct {
+ ITEM *item;
+ int32 pos;
+} DocRepresentation;
+
+static int
+compareDocR(const void *a, const void *b) {
+ if ( ((DocRepresentation *) a)->pos == ((DocRepresentation *) b)->pos )
+ return 1;
+ return ( ((DocRepresentation *) a)->pos > ((DocRepresentation *) b)->pos ) ? 1 : -1;
+}
+
+
+typedef struct {
+ DocRepresentation *doc;
+ int len;
+} ChkDocR;
+
+static bool
+checkcondition_DR(void *checkval, ITEM *val) {
+ DocRepresentation *ptr = ((ChkDocR*)checkval)->doc;
+
+ while( ptr - ((ChkDocR*)checkval)->doc < ((ChkDocR*)checkval)->len ) {
+ if ( val == ptr->item )
+ return true;
+ ptr++;
+ }
+
+ return false;
+}
+
+
+static bool
+Cover(DocRepresentation *doc, int len, QUERYTYPE *query, int *pos, int *p, int *q) {
+ int i;
+ DocRepresentation *ptr,*f=(DocRepresentation*)0xffffffff;
+ ITEM *item=GETQUERY(query);
+ int lastpos=*pos;
+ int oldq=*q;
+
+ *p=0x7fffffff;
+ *q=0;
+
+ for(i=0; i<query->size; i++) {
+ if ( item->type != VAL ) {
+ item++;
+ continue;
+ }
+ ptr = doc + *pos;
+
+ while(ptr-doc<len) {
+ if ( ptr->item == item ) {
+ if ( ptr->pos > *q ) {
+ *q = ptr->pos;
+ lastpos= ptr - doc;
+ }
+ break;
+ }
+ ptr++;
+ }
+
+ item++;
+ }
+
+ if (*q==0 )
+ return false;
+
+ if (*q==oldq) { /* already check this pos */
+ (*pos)++;
+ return Cover(doc, len, query, pos,p,q);
+ }
+
+ item=GETQUERY(query);
+ for(i=0; i<query->size; i++) {
+ if ( item->type != VAL ) {
+ item++;
+ continue;
+ }
+ ptr = doc + lastpos;
+
+ while(ptr>=doc+*pos) {
+ if ( ptr->item == item ) {
+ if ( ptr->pos < *p ) {
+ *p = ptr->pos;
+ f=ptr;
+ }
+ break;
+ }
+ ptr--;
+ }
+ item++;
+ }
+
+ if ( *p<=*q ) {
+ ChkDocR ch = { f, (doc + lastpos)-f+1 };
+ *pos = f-doc+1;
+ if ( TS_execute(GETQUERY(query), &ch, false, checkcondition_DR) ) {
+ /*elog(NOTICE,"OP:%d NP:%d P:%d Q:%d", *pos, lastpos, *p, *q);*/
+ return true;
+ } else
+ return Cover(doc, len, query, pos,p,q);
+ }
+
+ return false;
+}
+
+static DocRepresentation*
+get_docrep(tsvector *txt, QUERYTYPE *query, int *doclen) {
+ ITEM *item=GETQUERY(query);
+ WordEntry *entry;
+ WordEntryPos *post;
+ int4 dimt,j,i;
+ int len=query->size*4,cur=0;
+ DocRepresentation *doc;
+
+ *(uint16*)POSNULL = lengthof(POSNULL)-1;
+ doc = (DocRepresentation*)palloc(sizeof(DocRepresentation)*len);
+ for(i=0; i<query->size; i++) {
+ if ( item[i].type != VAL )
+ continue;
+
+ entry=find_wordentry(txt,query,&(item[i]));
+ if ( !entry )
+ continue;
+
+ if ( entry->haspos ) {
+ dimt = POSDATALEN(txt,entry);
+ post = POSDATAPTR(txt,entry);
+ } else {
+ dimt = *(uint16*)POSNULL;
+ post = POSNULL+1;
+ }
+
+ while( cur+dimt >= len ) {
+ len*=2;
+ doc = (DocRepresentation*)repalloc(doc,sizeof(DocRepresentation)*len);
+ }
+
+ for(j=0;j<dimt;j++) {
+ doc[cur].item=&(item[i]);
+ doc[cur].pos=post[j].pos;
+ cur++;
+ }
+ }
+
+ *doclen=cur;
+
+ if ( cur>0 ) {
+ if ( cur>1 )
+ qsort((void *) doc, cur, sizeof(DocRepresentation), compareDocR);
+ return doc;
+ }
+
+ pfree(doc);
+ return NULL;
+}
+
+
+Datum
+rank_cd(PG_FUNCTION_ARGS) {
+ int K = PG_GETARG_INT32(0);
+ tsvector *txt = (tsvector *) PG_DETOAST_DATUM(PG_GETARG_DATUM(1));
+ QUERYTYPE *query = (QUERYTYPE *) PG_DETOAST_DATUM(PG_GETARG_DATUM(2));
+ int method=DEF_NORM_METHOD;
+ DocRepresentation *doc;
+ float res=0.0;
+ int p=0,q=0,len,cur;
+
+ doc = get_docrep(txt, query, &len);
+ if ( !doc ) {
+ PG_FREE_IF_COPY(txt, 1);
+ PG_FREE_IF_COPY(query, 2);
+ PG_RETURN_FLOAT4(0.0);
+ }
+
+ cur=0;
+ if (K<=0)
+ K=4;
+ while( Cover(doc, len, query, &cur, &p, &q) )
+ res += ( q-p+1 > K ) ? ((float)K)/((float)(q-p+1)) : 1.0;
+
+ if ( PG_NARGS() == 4 )
+ method=PG_GETARG_INT32(3);
+
+ switch(method) {
+ case 0: break;
+ case 1: res /= log((float)cnt_length(txt)); break;
+ case 2: res /= (float)cnt_length(txt); break;
+ default:
+ elog(ERROR,"Unknown normalization method: %d",method);
+ }
+
+ pfree(doc);
+ PG_FREE_IF_COPY(txt, 1);
+ PG_FREE_IF_COPY(query, 2);
+
+ PG_RETURN_FLOAT4(res);
+}
+
+
+Datum
+rank_cd_def(PG_FUNCTION_ARGS) {
+ PG_RETURN_DATUM( DirectFunctionCall4(
+ rank_cd,
+ Int32GetDatum(-1),
+ PG_GETARG_DATUM(0),
+ PG_GETARG_DATUM(1),
+ ( PG_NARGS() == 3 ) ? PG_GETARG_DATUM(2) : Int32GetDatum(DEF_NORM_METHOD)
+ ));
+}
+
+/**************debug*************/
+
+typedef struct {
+ char *w;
+ int2 len;
+ int2 pos;
+ int2 start;
+ int2 finish;
+} DocWord;
+
+static int
+compareDocWord(const void *a, const void *b) {
+ if ( ((DocWord *) a)->pos == ((DocWord *) b)->pos )
+ return 1;
+ return ( ((DocWord *) a)->pos > ((DocWord *) b)->pos ) ? 1 : -1;
+}
+
+
+Datum
+get_covers(PG_FUNCTION_ARGS) {
+ tsvector *txt = (tsvector *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
+ QUERYTYPE *query = (QUERYTYPE *) PG_DETOAST_DATUM(PG_GETARG_DATUM(1));
+ WordEntry *pptr=ARRPTR(txt);
+ int i,dlen=0,j,cur=0,len=0,rlen;
+ DocWord *dw,*dwptr;
+ text *out;
+ char *cptr;
+ DocRepresentation *doc;
+ int pos=0,p,q,olddwpos=0;
+ int ncover=1;
+
+ doc = get_docrep(txt, query, &rlen);
+
+ if ( !doc ) {
+ out=palloc(VARHDRSZ);
+ VARATT_SIZEP(out) = VARHDRSZ;
+ PG_FREE_IF_COPY(txt,0);
+ PG_FREE_IF_COPY(query,1);
+ PG_RETURN_POINTER(out);
+ }
+
+ for(i=0;i<txt->size;i++) {
+ if (!pptr[i].haspos)
+ elog(ERROR,"No pos info");
+ dlen += POSDATALEN(txt,&(pptr[i]));
+ }
+
+ dwptr=dw=palloc(sizeof(DocWord)*dlen);
+ memset(dw,0,sizeof(DocWord)*dlen);
+
+ for(i=0;i<txt->size;i++) {
+ WordEntryPos *posdata = POSDATAPTR(txt,&(pptr[i]));
+ for(j=0;j<POSDATALEN(txt,&(pptr[i]));j++) {
+ dw[cur].w=STRPTR(txt)+pptr[i].pos;
+ dw[cur].len=pptr[i].len;
+ dw[cur].pos=posdata[j].pos;
+ cur++;
+ }
+ len+=(pptr[i].len + 1) * (int)POSDATALEN(txt,&(pptr[i]));
+ }
+ qsort((void *) dw, dlen, sizeof(DocWord), compareDocWord);
+
+ while( Cover(doc, rlen, query, &pos, &p, &q) ) {
+ dwptr=dw+olddwpos;
+ while(dwptr->pos < p && dwptr-dw<dlen)
+ dwptr++;
+ olddwpos=dwptr-dw;
+ dwptr->start=ncover;
+ while(dwptr->pos < q+1 && dwptr-dw<dlen)
+ dwptr++;
+ (dwptr-1)->finish=ncover;
+ len+= 4 /* {}+two spaces */ + 2*16 /*numbers*/;
+ ncover++;
+ }
+
+ out=palloc(VARHDRSZ+len);
+ cptr=((char*)out)+VARHDRSZ;
+ dwptr=dw;
+
+ while( dwptr-dw < dlen) {
+ if ( dwptr->start ) {
+ sprintf(cptr,"{%d ",dwptr->start);
+ cptr=strchr(cptr,'\0');
+ }
+ memcpy(cptr,dwptr->w,dwptr->len);
+ cptr+=dwptr->len;
+ *cptr=' ';
+ cptr++;
+ if ( dwptr->finish ) {
+ sprintf(cptr,"}%d ",dwptr->finish);
+ cptr=strchr(cptr,'\0');
+ }
+ dwptr++;
+ }
+
+ VARATT_SIZEP(out) = cptr - ((char*)out);
+
+ pfree(dw);
+ pfree(doc);
+
+ PG_FREE_IF_COPY(txt,0);
+ PG_FREE_IF_COPY(query,1);
+ PG_RETURN_POINTER(out);
+}
+