diff options
author | Tom Lane <tgl@sss.pgh.pa.us> | 2007-08-21 01:11:32 +0000 |
---|---|---|
committer | Tom Lane <tgl@sss.pgh.pa.us> | 2007-08-21 01:11:32 +0000 |
commit | 140d4ebcb46e17cdb1be43892ed797e5e060c8ef (patch) | |
tree | f99d209dbe5e40dcb434c3841e0c8b4ff383f453 /src/backend/utils/adt | |
parent | 4e94d1f952c3ce5670ceae3c12b55e344503a701 (diff) | |
download | postgresql-140d4ebcb46e17cdb1be43892ed797e5e060c8ef.tar.gz postgresql-140d4ebcb46e17cdb1be43892ed797e5e060c8ef.zip |
Tsearch2 functionality migrates to core. The bulk of this work is by
Oleg Bartunov and Teodor Sigaev, but I did a lot of editorializing,
so anything that's broken is probably my fault.
Documentation is nonexistent as yet, but let's land the patch so we can
get some portability testing done.
Diffstat (limited to 'src/backend/utils/adt')
-rw-r--r-- | src/backend/utils/adt/Makefile | 9 | ||||
-rw-r--r-- | src/backend/utils/adt/regproc.c | 229 | ||||
-rw-r--r-- | src/backend/utils/adt/selfuncs.c | 6 | ||||
-rw-r--r-- | src/backend/utils/adt/tsginidx.c | 157 | ||||
-rw-r--r-- | src/backend/utils/adt/tsgistidx.c | 784 | ||||
-rw-r--r-- | src/backend/utils/adt/tsquery.c | 767 | ||||
-rw-r--r-- | src/backend/utils/adt/tsquery_cleanup.c | 261 | ||||
-rw-r--r-- | src/backend/utils/adt/tsquery_gist.c | 259 | ||||
-rw-r--r-- | src/backend/utils/adt/tsquery_op.c | 289 | ||||
-rw-r--r-- | src/backend/utils/adt/tsquery_rewrite.c | 524 | ||||
-rw-r--r-- | src/backend/utils/adt/tsquery_util.c | 317 | ||||
-rw-r--r-- | src/backend/utils/adt/tsrank.c | 804 | ||||
-rw-r--r-- | src/backend/utils/adt/tsvector.c | 683 | ||||
-rw-r--r-- | src/backend/utils/adt/tsvector_op.c | 1334 |
14 files changed, 6418 insertions, 5 deletions
diff --git a/src/backend/utils/adt/Makefile b/src/backend/utils/adt/Makefile index 12d158a49bd..73fcebb496d 100644 --- a/src/backend/utils/adt/Makefile +++ b/src/backend/utils/adt/Makefile @@ -1,7 +1,7 @@ # # Makefile for utils/adt # -# $PostgreSQL: pgsql/src/backend/utils/adt/Makefile,v 1.64 2007/04/02 03:49:39 tgl Exp $ +# $PostgreSQL: pgsql/src/backend/utils/adt/Makefile,v 1.65 2007/08/21 01:11:18 tgl Exp $ # subdir = src/backend/utils/adt @@ -25,8 +25,11 @@ OBJS = acl.o arrayfuncs.o array_userfuncs.o arrayutils.o bool.o \ tid.o timestamp.o varbit.o varchar.o varlena.o version.o xid.o \ network.o mac.o inet_net_ntop.o inet_net_pton.o \ ri_triggers.o pg_lzcompress.o pg_locale.o formatting.o \ - ascii.o quote.o pgstatfuncs.o encode.o dbsize.o genfile.o xml.o \ - uuid.o + ascii.o quote.o pgstatfuncs.o encode.o dbsize.o genfile.o \ + tsginidx.o tsgistidx.o tsquery.o tsquery_cleanup.o tsquery_gist.o \ + tsquery_op.o tsquery_rewrite.o tsquery_util.o tsrank.o \ + tsvector.o tsvector_op.o \ + uuid.o xml.o like.o: like.c like_match.c diff --git a/src/backend/utils/adt/regproc.c b/src/backend/utils/adt/regproc.c index 7ec9db3ec9b..e49f323daa8 100644 --- a/src/backend/utils/adt/regproc.c +++ b/src/backend/utils/adt/regproc.c @@ -13,7 +13,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/adt/regproc.c,v 1.102 2007/06/26 16:48:09 alvherre Exp $ + * $PostgreSQL: pgsql/src/backend/utils/adt/regproc.c,v 1.103 2007/08/21 01:11:18 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -27,6 +27,8 @@ #include "catalog/namespace.h" #include "catalog/pg_operator.h" #include "catalog/pg_proc.h" +#include "catalog/pg_ts_config.h" +#include "catalog/pg_ts_dict.h" #include "catalog/pg_type.h" #include "miscadmin.h" #include "parser/parse_type.h" @@ -1066,6 +1068,231 @@ regtypesend(PG_FUNCTION_ARGS) /* + * regconfigin - converts "tsconfigname" to tsconfig OID + * + * We also accept a numeric OID, for symmetry with the output routine. + * + * '-' signifies unknown (OID 0). In all other cases, the input must + * match an existing pg_ts_config entry. + * + * This function is not needed in bootstrap mode, so we don't worry about + * making it work then. + */ +Datum +regconfigin(PG_FUNCTION_ARGS) +{ + char *cfg_name_or_oid = PG_GETARG_CSTRING(0); + Oid result; + List *names; + + /* '-' ? */ + if (strcmp(cfg_name_or_oid, "-") == 0) + PG_RETURN_OID(InvalidOid); + + /* Numeric OID? */ + if (cfg_name_or_oid[0] >= '0' && + cfg_name_or_oid[0] <= '9' && + strspn(cfg_name_or_oid, "0123456789") == strlen(cfg_name_or_oid)) + { + result = DatumGetObjectId(DirectFunctionCall1(oidin, + CStringGetDatum(cfg_name_or_oid))); + PG_RETURN_OID(result); + } + + /* + * Normal case: parse the name into components and see if it matches any + * pg_ts_config entries in the current search path. + */ + names = stringToQualifiedNameList(cfg_name_or_oid); + + result = TSConfigGetCfgid(names, false); + + PG_RETURN_OID(result); +} + +/* + * regconfigout - converts tsconfig OID to "tsconfigname" + */ +Datum +regconfigout(PG_FUNCTION_ARGS) +{ + Oid cfgid = PG_GETARG_OID(0); + char *result; + HeapTuple cfgtup; + + if (cfgid == InvalidOid) + { + result = pstrdup("-"); + PG_RETURN_CSTRING(result); + } + + cfgtup = SearchSysCache(TSCONFIGOID, + ObjectIdGetDatum(cfgid), + 0, 0, 0); + + if (HeapTupleIsValid(cfgtup)) + { + Form_pg_ts_config cfgform = (Form_pg_ts_config) GETSTRUCT(cfgtup); + char *cfgname = NameStr(cfgform->cfgname); + char *nspname; + + /* + * Would this config be found by regconfigin? If not, qualify it. + */ + if (TSConfigIsVisible(cfgid)) + nspname = NULL; + else + nspname = get_namespace_name(cfgform->cfgnamespace); + + result = quote_qualified_identifier(nspname, cfgname); + + ReleaseSysCache(cfgtup); + } + else + { + /* If OID doesn't match any pg_ts_config row, return it numerically */ + result = (char *) palloc(NAMEDATALEN); + snprintf(result, NAMEDATALEN, "%u", cfgid); + } + + PG_RETURN_CSTRING(result); +} + +/* + * regconfigrecv - converts external binary format to regconfig + */ +Datum +regconfigrecv(PG_FUNCTION_ARGS) +{ + /* Exactly the same as oidrecv, so share code */ + return oidrecv(fcinfo); +} + +/* + * regconfigsend - converts regconfig to binary format + */ +Datum +regconfigsend(PG_FUNCTION_ARGS) +{ + /* Exactly the same as oidsend, so share code */ + return oidsend(fcinfo); +} + + +/* + * regdictionaryin - converts "tsdictionaryname" to tsdictionary OID + * + * We also accept a numeric OID, for symmetry with the output routine. + * + * '-' signifies unknown (OID 0). In all other cases, the input must + * match an existing pg_ts_dict entry. + * + * This function is not needed in bootstrap mode, so we don't worry about + * making it work then. + */ +Datum +regdictionaryin(PG_FUNCTION_ARGS) +{ + char *dict_name_or_oid = PG_GETARG_CSTRING(0); + Oid result; + List *names; + + /* '-' ? */ + if (strcmp(dict_name_or_oid, "-") == 0) + PG_RETURN_OID(InvalidOid); + + /* Numeric OID? */ + if (dict_name_or_oid[0] >= '0' && + dict_name_or_oid[0] <= '9' && + strspn(dict_name_or_oid, "0123456789") == strlen(dict_name_or_oid)) + { + result = DatumGetObjectId(DirectFunctionCall1(oidin, + CStringGetDatum(dict_name_or_oid))); + PG_RETURN_OID(result); + } + + /* + * Normal case: parse the name into components and see if it matches any + * pg_ts_dict entries in the current search path. + */ + names = stringToQualifiedNameList(dict_name_or_oid); + + result = TSDictionaryGetDictid(names, false); + + PG_RETURN_OID(result); +} + +/* + * regdictionaryout - converts tsdictionary OID to "tsdictionaryname" + */ +Datum +regdictionaryout(PG_FUNCTION_ARGS) +{ + Oid dictid = PG_GETARG_OID(0); + char *result; + HeapTuple dicttup; + + if (dictid == InvalidOid) + { + result = pstrdup("-"); + PG_RETURN_CSTRING(result); + } + + dicttup = SearchSysCache(TSDICTOID, + ObjectIdGetDatum(dictid), + 0, 0, 0); + + if (HeapTupleIsValid(dicttup)) + { + Form_pg_ts_dict dictform = (Form_pg_ts_dict) GETSTRUCT(dicttup); + char *dictname = NameStr(dictform->dictname); + char *nspname; + + /* + * Would this dictionary be found by regdictionaryin? + * If not, qualify it. + */ + if (TSDictionaryIsVisible(dictid)) + nspname = NULL; + else + nspname = get_namespace_name(dictform->dictnamespace); + + result = quote_qualified_identifier(nspname, dictname); + + ReleaseSysCache(dicttup); + } + else + { + /* If OID doesn't match any pg_ts_dict row, return it numerically */ + result = (char *) palloc(NAMEDATALEN); + snprintf(result, NAMEDATALEN, "%u", dictid); + } + + PG_RETURN_CSTRING(result); +} + +/* + * regdictionaryrecv - converts external binary format to regdictionary + */ +Datum +regdictionaryrecv(PG_FUNCTION_ARGS) +{ + /* Exactly the same as oidrecv, so share code */ + return oidrecv(fcinfo); +} + +/* + * regdictionarysend - converts regdictionary to binary format + */ +Datum +regdictionarysend(PG_FUNCTION_ARGS) +{ + /* Exactly the same as oidsend, so share code */ + return oidsend(fcinfo); +} + + +/* * text_regclass: convert text to regclass * * This could be replaced by CoerceViaIO, except that we need to treat diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index 1ba0b4eac48..60da08b1ec4 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -15,7 +15,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.234 2007/05/05 17:05:48 mha Exp $ + * $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.235 2007/08/21 01:11:18 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -2822,6 +2822,8 @@ convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue, case REGOPERATOROID: case REGCLASSOID: case REGTYPEOID: + case REGCONFIGOID: + case REGDICTIONARYOID: *scaledvalue = convert_numeric_to_scalar(value, valuetypid); *scaledlobound = convert_numeric_to_scalar(lobound, boundstypid); *scaledhibound = convert_numeric_to_scalar(hibound, boundstypid); @@ -2925,6 +2927,8 @@ convert_numeric_to_scalar(Datum value, Oid typid) case REGOPERATOROID: case REGCLASSOID: case REGTYPEOID: + case REGCONFIGOID: + case REGDICTIONARYOID: /* we can treat OIDs as integers... */ return (double) DatumGetObjectId(value); } diff --git a/src/backend/utils/adt/tsginidx.c b/src/backend/utils/adt/tsginidx.c new file mode 100644 index 00000000000..491dd21aa81 --- /dev/null +++ b/src/backend/utils/adt/tsginidx.c @@ -0,0 +1,157 @@ +/*------------------------------------------------------------------------- + * + * tsginidx.c + * GIN support functions for tsvector_ops + * + * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * $PostgreSQL: pgsql/src/backend/utils/adt/tsginidx.c,v 1.1 2007/08/21 01:11:19 tgl Exp $ + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/skey.h" +#include "tsearch/ts_type.h" +#include "tsearch/ts_utils.h" + + +Datum +gin_extract_tsvector(PG_FUNCTION_ARGS) +{ + TSVector vector = PG_GETARG_TSVECTOR(0); + uint32 *nentries = (uint32 *) PG_GETARG_POINTER(1); + Datum *entries = NULL; + + *nentries = 0; + if (vector->size > 0) + { + int i; + WordEntry *we = ARRPTR(vector); + + *nentries = (uint32) vector->size; + entries = (Datum *) palloc(sizeof(Datum) * vector->size); + + for (i = 0; i < vector->size; i++) + { + text *txt = (text *) palloc(VARHDRSZ + we->len); + + SET_VARSIZE(txt, VARHDRSZ + we->len); + memcpy(VARDATA(txt), STRPTR(vector) + we->pos, we->len); + + entries[i] = PointerGetDatum(txt); + + we++; + } + } + + PG_FREE_IF_COPY(vector, 0); + PG_RETURN_POINTER(entries); +} + +Datum +gin_extract_query(PG_FUNCTION_ARGS) +{ + TSQuery query = PG_GETARG_TSQUERY(0); + uint32 *nentries = (uint32 *) PG_GETARG_POINTER(1); + StrategyNumber strategy = PG_GETARG_UINT16(2); + Datum *entries = NULL; + + *nentries = 0; + + if (query->size > 0) + { + int4 i, + j = 0, + len; + QueryItem *item; + + item = clean_NOT(GETQUERY(query), &len); + if (!item) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("query requires full scan, which is not supported by GIN indexes"))); + + item = GETQUERY(query); + + for (i = 0; i < query->size; i++) + if (item[i].type == VAL) + (*nentries)++; + + entries = (Datum *) palloc(sizeof(Datum) * (*nentries)); + + for (i = 0; i < query->size; i++) + if (item[i].type == VAL) + { + text *txt; + + txt = (text *) palloc(VARHDRSZ + item[i].length); + + SET_VARSIZE(txt, VARHDRSZ + item[i].length); + memcpy(VARDATA(txt), GETOPERAND(query) + item[i].distance, item[i].length); + + entries[j++] = PointerGetDatum(txt); + + if (strategy != TSearchWithClassStrategyNumber && item[i].weight != 0) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("@@ operator does not support lexeme class restrictions"), + errhint("Use the @@@ operator instead."))); + } + } + else + *nentries = -1; /* nothing can be found */ + + PG_FREE_IF_COPY(query, 0); + + PG_RETURN_POINTER(entries); +} + +typedef struct +{ + QueryItem *frst; + bool *mapped_check; +} GinChkVal; + +static bool +checkcondition_gin(void *checkval, QueryItem * val) +{ + GinChkVal *gcv = (GinChkVal *) checkval; + + return gcv->mapped_check[val - gcv->frst]; +} + +Datum +gin_ts_consistent(PG_FUNCTION_ARGS) +{ + bool *check = (bool *) PG_GETARG_POINTER(0); + /* StrategyNumber strategy = PG_GETARG_UINT16(1); */ + TSQuery query = PG_GETARG_TSQUERY(2); + bool res = FALSE; + + if (query->size > 0) + { + int4 i, + j = 0; + QueryItem *item; + GinChkVal gcv; + + gcv.frst = item = GETQUERY(query); + gcv.mapped_check = (bool *) palloc(sizeof(bool) * query->size); + + for (i = 0; i < query->size; i++) + if (item[i].type == VAL) + gcv.mapped_check[i] = check[j++]; + + res = TS_execute( + GETQUERY(query), + &gcv, + true, + checkcondition_gin + ); + } + + PG_RETURN_BOOL(res); +} diff --git a/src/backend/utils/adt/tsgistidx.c b/src/backend/utils/adt/tsgistidx.c new file mode 100644 index 00000000000..04c2b4c442f --- /dev/null +++ b/src/backend/utils/adt/tsgistidx.c @@ -0,0 +1,784 @@ +/*------------------------------------------------------------------------- + * + * tsgistidx.c + * GiST support functions for tsvector_ops + * + * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * $PostgreSQL: pgsql/src/backend/utils/adt/tsgistidx.c,v 1.1 2007/08/21 01:11:19 tgl Exp $ + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/gist.h" +#include "access/tuptoaster.h" +#include "tsearch/ts_type.h" +#include "tsearch/ts_utils.h" +#include "utils/pg_crc.h" + + +#define SIGLENINT 31 /* >121 => key will toast, so it will not work + * !!! */ + +#define SIGLEN ( sizeof(int4) * SIGLENINT ) +#define SIGLENBIT (SIGLEN * BITS_PER_BYTE) + +typedef char BITVEC[SIGLEN]; +typedef char *BITVECP; + +#define LOOPBYTE(a) \ + for(i=0;i<SIGLEN;i++) {\ + a;\ +} + +#define GETBYTE(x,i) ( *( (BITVECP)(x) + (int)( (i) / BITS_PER_BYTE ) ) ) +#define GETBITBYTE(x,i) ( ((char)(x)) >> (i) & 0x01 ) +#define CLRBIT(x,i) GETBYTE(x,i) &= ~( 0x01 << ( (i) % BITS_PER_BYTE ) ) +#define SETBIT(x,i) GETBYTE(x,i) |= ( 0x01 << ( (i) % BITS_PER_BYTE ) ) +#define GETBIT(x,i) ( (GETBYTE(x,i) >> ( (i) % BITS_PER_BYTE )) & 0x01 ) + +#define HASHVAL(val) (((unsigned int)(val)) % SIGLENBIT) +#define HASH(sign, val) SETBIT((sign), HASHVAL(val)) + +#define GETENTRY(vec,pos) ((SignTSVector *) DatumGetPointer((vec)->vector[(pos)].key)) + +/* + * type of GiST index key + */ + +typedef struct +{ + int32 vl_len_; /* varlena header (do not touch directly!) */ ; + int4 flag; + char data[1]; +} SignTSVector; + +#define ARRKEY 0x01 +#define SIGNKEY 0x02 +#define ALLISTRUE 0x04 + +#define ISARRKEY(x) ( ((SignTSVector*)(x))->flag & ARRKEY ) +#define ISSIGNKEY(x) ( ((SignTSVector*)(x))->flag & SIGNKEY ) +#define ISALLTRUE(x) ( ((SignTSVector*)(x))->flag & ALLISTRUE ) + +#define GTHDRSIZE ( VARHDRSZ + sizeof(int4) ) +#define CALCGTSIZE(flag, len) ( GTHDRSIZE + ( ( (flag) & ARRKEY ) ? ((len)*sizeof(int4)) : (((flag) & ALLISTRUE) ? 0 : SIGLEN) ) ) + +#define GETSIGN(x) ( (BITVECP)( (char*)(x)+GTHDRSIZE ) ) +#define GETARR(x) ( (int4*)( (char*)(x)+GTHDRSIZE ) ) +#define ARRNELEM(x) ( ( VARSIZE(x) - GTHDRSIZE )/sizeof(int4) ) + +/* Number of one-bits in an unsigned byte */ +static const uint8 number_of_ones[256] = { + 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8 +}; + +static int4 sizebitvec(BITVECP sign); + +Datum +gtsvectorin(PG_FUNCTION_ARGS) +{ + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("gtsvector_in not implemented"))); + PG_RETURN_DATUM(0); +} + +#define SINGOUTSTR "%d true bits, %d false bits" +#define ARROUTSTR "%d unique words" +#define EXTRALEN ( 2*13 ) + +static int outbuf_maxlen = 0; + +Datum +gtsvectorout(PG_FUNCTION_ARGS) +{ + SignTSVector *key = (SignTSVector *) DatumGetPointer(PG_DETOAST_DATUM(PG_GETARG_POINTER(0))); + char *outbuf; + + if (outbuf_maxlen == 0) + outbuf_maxlen = 2 * EXTRALEN + Max(strlen(SINGOUTSTR), strlen(ARROUTSTR)) + 1; + outbuf = palloc(outbuf_maxlen); + + if (ISARRKEY(key)) + sprintf(outbuf, ARROUTSTR, (int) ARRNELEM(key)); + else + { + int cnttrue = (ISALLTRUE(key)) ? SIGLENBIT : sizebitvec(GETSIGN(key)); + + sprintf(outbuf, SINGOUTSTR, cnttrue, (int) SIGLENBIT - cnttrue); + } + + PG_FREE_IF_COPY(key, 0); + PG_RETURN_POINTER(outbuf); +} + +static int +compareint(const void *a, const void *b) +{ + if (*((int4 *) a) == *((int4 *) b)) + return 0; + return (*((int4 *) a) > *((int4 *) b)) ? 1 : -1; +} + +static int +uniqueint(int4 *a, int4 l) +{ + int4 *ptr, + *res; + + if (l == 1) + return l; + + ptr = res = a; + + qsort((void *) a, l, sizeof(int4), compareint); + + while (ptr - a < l) + if (*ptr != *res) + *(++res) = *ptr++; + else + ptr++; + return res + 1 - a; +} + +static void +makesign(BITVECP sign, SignTSVector * a) +{ + int4 k, + len = ARRNELEM(a); + int4 *ptr = GETARR(a); + + MemSet((void *) sign, 0, sizeof(BITVEC)); + for (k = 0; k < len; k++) + HASH(sign, ptr[k]); +} + +Datum +gtsvector_compress(PG_FUNCTION_ARGS) +{ + GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0); + GISTENTRY *retval = entry; + + if (entry->leafkey) + { /* tsvector */ + SignTSVector *res; + TSVector val = DatumGetTSVector(entry->key); + int4 len; + int4 *arr; + WordEntry *ptr = ARRPTR(val); + char *words = STRPTR(val); + + len = CALCGTSIZE(ARRKEY, val->size); + res = (SignTSVector *) palloc(len); + SET_VARSIZE(res, len); + res->flag = ARRKEY; + arr = GETARR(res); + len = val->size; + while (len--) + { + pg_crc32 c; + + INIT_CRC32(c); + COMP_CRC32(c, words + ptr->pos, ptr->len); + FIN_CRC32(c); + + *arr = *(int4 *) &c; + arr++; + ptr++; + } + + len = uniqueint(GETARR(res), val->size); + if (len != val->size) + { + /* + * there is a collision of hash-function; len is always less than + * val->size + */ + len = CALCGTSIZE(ARRKEY, len); + res = (SignTSVector *) repalloc((void *) res, len); + SET_VARSIZE(res, len); + } + + /* make signature, if array is too long */ + if (VARSIZE(res) > TOAST_INDEX_TARGET) + { + SignTSVector *ressign; + + len = CALCGTSIZE(SIGNKEY, 0); + ressign = (SignTSVector *) palloc(len); + SET_VARSIZE(ressign, len); + ressign->flag = SIGNKEY; + makesign(GETSIGN(ressign), res); + res = ressign; + } + + retval = (GISTENTRY *) palloc(sizeof(GISTENTRY)); + gistentryinit(*retval, PointerGetDatum(res), + entry->rel, entry->page, + entry->offset, FALSE); + } + else if (ISSIGNKEY(DatumGetPointer(entry->key)) && + !ISALLTRUE(DatumGetPointer(entry->key))) + { + int4 i, + len; + SignTSVector *res; + BITVECP sign = GETSIGN(DatumGetPointer(entry->key)); + + LOOPBYTE( + if ((sign[i] & 0xff) != 0xff) + PG_RETURN_POINTER(retval); + ); + + len = CALCGTSIZE(SIGNKEY | ALLISTRUE, 0); + res = (SignTSVector *) palloc(len); + SET_VARSIZE(res, len); + res->flag = SIGNKEY | ALLISTRUE; + + retval = (GISTENTRY *) palloc(sizeof(GISTENTRY)); + gistentryinit(*retval, PointerGetDatum(res), + entry->rel, entry->page, + entry->offset, FALSE); + } + PG_RETURN_POINTER(retval); +} + +Datum +gtsvector_decompress(PG_FUNCTION_ARGS) +{ + GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0); + SignTSVector *key = (SignTSVector *) DatumGetPointer(PG_DETOAST_DATUM(entry->key)); + + if (key != (SignTSVector *) DatumGetPointer(entry->key)) + { + GISTENTRY *retval = (GISTENTRY *) palloc(sizeof(GISTENTRY)); + + gistentryinit(*retval, PointerGetDatum(key), + entry->rel, entry->page, + entry->offset, FALSE); + + PG_RETURN_POINTER(retval); + } + + PG_RETURN_POINTER(entry); +} + +typedef struct +{ + int4 *arrb; + int4 *arre; +} CHKVAL; + +/* + * is there value 'val' in array or not ? + */ +static bool +checkcondition_arr(void *checkval, QueryItem * val) +{ + int4 *StopLow = ((CHKVAL *) checkval)->arrb; + int4 *StopHigh = ((CHKVAL *) checkval)->arre; + int4 *StopMiddle; + + /* Loop invariant: StopLow <= val < StopHigh */ + + while (StopLow < StopHigh) + { + StopMiddle = StopLow + (StopHigh - StopLow) / 2; + if (*StopMiddle == val->val) + return (true); + else if (*StopMiddle < val->val) + StopLow = StopMiddle + 1; + else + StopHigh = StopMiddle; + } + + return (false); +} + +static bool +checkcondition_bit(void *checkval, QueryItem * val) +{ + return GETBIT(checkval, HASHVAL(val->val)); +} + +Datum +gtsvector_consistent(PG_FUNCTION_ARGS) +{ + TSQuery query = PG_GETARG_TSQUERY(1); + SignTSVector *key = (SignTSVector *) DatumGetPointer( + ((GISTENTRY *) PG_GETARG_POINTER(0))->key + ); + + if (!query->size) + PG_RETURN_BOOL(false); + + if (ISSIGNKEY(key)) + { + if (ISALLTRUE(key)) + PG_RETURN_BOOL(true); + + PG_RETURN_BOOL(TS_execute( + GETQUERY(query), + (void *) GETSIGN(key), false, + checkcondition_bit + )); + } + else + { /* only leaf pages */ + CHKVAL chkval; + + chkval.arrb = GETARR(key); + chkval.arre = chkval.arrb + ARRNELEM(key); + PG_RETURN_BOOL(TS_execute( + GETQUERY(query), + (void *) &chkval, true, + checkcondition_arr + )); + } +} + +static int4 +unionkey(BITVECP sbase, SignTSVector * add) +{ + int4 i; + + if (ISSIGNKEY(add)) + { + BITVECP sadd = GETSIGN(add); + + if (ISALLTRUE(add)) + return 1; + + LOOPBYTE( + sbase[i] |= sadd[i]; + ); + } + else + { + int4 *ptr = GETARR(add); + + for (i = 0; i < ARRNELEM(add); i++) + HASH(sbase, ptr[i]); + } + return 0; +} + + +Datum +gtsvector_union(PG_FUNCTION_ARGS) +{ + GistEntryVector *entryvec = (GistEntryVector *) PG_GETARG_POINTER(0); + int *size = (int *) PG_GETARG_POINTER(1); + BITVEC base; + int4 i, + len; + int4 flag = 0; + SignTSVector *result; + + MemSet((void *) base, 0, sizeof(BITVEC)); + for (i = 0; i < entryvec->n; i++) + { + if (unionkey(base, GETENTRY(entryvec, i))) + { + flag = ALLISTRUE; + break; + } + } + + flag |= SIGNKEY; + len = CALCGTSIZE(flag, 0); + result = (SignTSVector *) palloc(len); + *size = len; + SET_VARSIZE(result, len); + result->flag = flag; + if (!ISALLTRUE(result)) + memcpy((void *) GETSIGN(result), (void *) base, sizeof(BITVEC)); + + PG_RETURN_POINTER(result); +} + +Datum +gtsvector_same(PG_FUNCTION_ARGS) +{ + SignTSVector *a = (SignTSVector *) PG_GETARG_POINTER(0); + SignTSVector *b = (SignTSVector *) PG_GETARG_POINTER(1); + bool *result = (bool *) PG_GETARG_POINTER(2); + + if (ISSIGNKEY(a)) + { /* then b also ISSIGNKEY */ + if (ISALLTRUE(a) && ISALLTRUE(b)) + *result = true; + else if (ISALLTRUE(a)) + *result = false; + else if (ISALLTRUE(b)) + *result = false; + else + { + int4 i; + BITVECP sa = GETSIGN(a), + sb = GETSIGN(b); + + *result = true; + LOOPBYTE( + if (sa[i] != sb[i]) + { + *result = false; + break; + } + ); + } + } + else + { /* a and b ISARRKEY */ + int4 lena = ARRNELEM(a), + lenb = ARRNELEM(b); + + if (lena != lenb) + *result = false; + else + { + int4 *ptra = GETARR(a), + *ptrb = GETARR(b); + int4 i; + + *result = true; + for (i = 0; i < lena; i++) + if (ptra[i] != ptrb[i]) + { + *result = false; + break; + } + } + } + + PG_RETURN_POINTER(result); +} + +static int4 +sizebitvec(BITVECP sign) +{ + int4 size = 0, + i; + + LOOPBYTE( + size += number_of_ones[(unsigned char) sign[i]]; + ); + return size; +} + +static int +hemdistsign(BITVECP a, BITVECP b) +{ + int i, + diff, + dist = 0; + + LOOPBYTE( + diff = (unsigned char) (a[i] ^ b[i]); + dist += number_of_ones[diff]; + ); + return dist; +} + +static int +hemdist(SignTSVector * a, SignTSVector * b) +{ + if (ISALLTRUE(a)) + { + if (ISALLTRUE(b)) + return 0; + else + return SIGLENBIT - sizebitvec(GETSIGN(b)); + } + else if (ISALLTRUE(b)) + return SIGLENBIT - sizebitvec(GETSIGN(a)); + + return hemdistsign(GETSIGN(a), GETSIGN(b)); +} + +Datum +gtsvector_penalty(PG_FUNCTION_ARGS) +{ + GISTENTRY *origentry = (GISTENTRY *) PG_GETARG_POINTER(0); /* always ISSIGNKEY */ + GISTENTRY *newentry = (GISTENTRY *) PG_GETARG_POINTER(1); + float *penalty = (float *) PG_GETARG_POINTER(2); + SignTSVector *origval = (SignTSVector *) DatumGetPointer(origentry->key); + SignTSVector *newval = (SignTSVector *) DatumGetPointer(newentry->key); + BITVECP orig = GETSIGN(origval); + + *penalty = 0.0; + + if (ISARRKEY(newval)) + { + BITVEC sign; + + makesign(sign, newval); + + if (ISALLTRUE(origval)) + *penalty = ((float) (SIGLENBIT - sizebitvec(sign))) / (float) (SIGLENBIT + 1); + else + *penalty = hemdistsign(sign, orig); + } + else + *penalty = hemdist(origval, newval); + PG_RETURN_POINTER(penalty); +} + +typedef struct +{ + bool allistrue; + BITVEC sign; +} CACHESIGN; + +static void +fillcache(CACHESIGN * item, SignTSVector * key) +{ + item->allistrue = false; + if (ISARRKEY(key)) + makesign(item->sign, key); + else if (ISALLTRUE(key)) + item->allistrue = true; + else + memcpy((void *) item->sign, (void *) GETSIGN(key), sizeof(BITVEC)); +} + +#define WISH_F(a,b,c) (double)( -(double)(((a)-(b))*((a)-(b))*((a)-(b)))*(c) ) +typedef struct +{ + OffsetNumber pos; + int4 cost; +} SPLITCOST; + +static int +comparecost(const void *a, const void *b) +{ + if (((SPLITCOST *) a)->cost == ((SPLITCOST *) b)->cost) + return 0; + else + return (((SPLITCOST *) a)->cost > ((SPLITCOST *) b)->cost) ? 1 : -1; +} + + +static int +hemdistcache(CACHESIGN * a, CACHESIGN * b) +{ + if (a->allistrue) + { + if (b->allistrue) + return 0; + else + return SIGLENBIT - sizebitvec(b->sign); + } + else if (b->allistrue) + return SIGLENBIT - sizebitvec(a->sign); + + return hemdistsign(a->sign, b->sign); +} + +Datum +gtsvector_picksplit(PG_FUNCTION_ARGS) +{ + GistEntryVector *entryvec = (GistEntryVector *) PG_GETARG_POINTER(0); + GIST_SPLITVEC *v = (GIST_SPLITVEC *) PG_GETARG_POINTER(1); + OffsetNumber k, + j; + SignTSVector *datum_l, + *datum_r; + BITVECP union_l, + union_r; + int4 size_alpha, + size_beta; + int4 size_waste, + waste = -1; + int4 nbytes; + OffsetNumber seed_1 = 0, + seed_2 = 0; + OffsetNumber *left, + *right; + OffsetNumber maxoff; + BITVECP ptr; + int i; + CACHESIGN *cache; + SPLITCOST *costvector; + + maxoff = entryvec->n - 2; + nbytes = (maxoff + 2) * sizeof(OffsetNumber); + v->spl_left = (OffsetNumber *) palloc(nbytes); + v->spl_right = (OffsetNumber *) palloc(nbytes); + + cache = (CACHESIGN *) palloc(sizeof(CACHESIGN) * (maxoff + 2)); + fillcache(&cache[FirstOffsetNumber], GETENTRY(entryvec, FirstOffsetNumber)); + + for (k = FirstOffsetNumber; k < maxoff; k = OffsetNumberNext(k)) + { + for (j = OffsetNumberNext(k); j <= maxoff; j = OffsetNumberNext(j)) + { + if (k == FirstOffsetNumber) + fillcache(&cache[j], GETENTRY(entryvec, j)); + + size_waste = hemdistcache(&(cache[j]), &(cache[k])); + if (size_waste > waste) + { + waste = size_waste; + seed_1 = k; + seed_2 = j; + } + } + } + + left = v->spl_left; + v->spl_nleft = 0; + right = v->spl_right; + v->spl_nright = 0; + + if (seed_1 == 0 || seed_2 == 0) + { + seed_1 = 1; + seed_2 = 2; + } + + /* form initial .. */ + if (cache[seed_1].allistrue) + { + datum_l = (SignTSVector *) palloc(CALCGTSIZE(SIGNKEY | ALLISTRUE, 0)); + SET_VARSIZE(datum_l, CALCGTSIZE(SIGNKEY | ALLISTRUE, 0)); + datum_l->flag = SIGNKEY | ALLISTRUE; + } + else + { + datum_l = (SignTSVector *) palloc(CALCGTSIZE(SIGNKEY, 0)); + SET_VARSIZE(datum_l, CALCGTSIZE(SIGNKEY, 0)); + datum_l->flag = SIGNKEY; + memcpy((void *) GETSIGN(datum_l), (void *) cache[seed_1].sign, sizeof(BITVEC)); + } + if (cache[seed_2].allistrue) + { + datum_r = (SignTSVector *) palloc(CALCGTSIZE(SIGNKEY | ALLISTRUE, 0)); + SET_VARSIZE(datum_r, CALCGTSIZE(SIGNKEY | ALLISTRUE, 0)); + datum_r->flag = SIGNKEY | ALLISTRUE; + } + else + { + datum_r = (SignTSVector *) palloc(CALCGTSIZE(SIGNKEY, 0)); + SET_VARSIZE(datum_r, CALCGTSIZE(SIGNKEY, 0)); + datum_r->flag = SIGNKEY; + memcpy((void *) GETSIGN(datum_r), (void *) cache[seed_2].sign, sizeof(BITVEC)); + } + + union_l = GETSIGN(datum_l); + union_r = GETSIGN(datum_r); + maxoff = OffsetNumberNext(maxoff); + fillcache(&cache[maxoff], GETENTRY(entryvec, maxoff)); + /* sort before ... */ + costvector = (SPLITCOST *) palloc(sizeof(SPLITCOST) * maxoff); + for (j = FirstOffsetNumber; j <= maxoff; j = OffsetNumberNext(j)) + { + costvector[j - 1].pos = j; + size_alpha = hemdistcache(&(cache[seed_1]), &(cache[j])); + size_beta = hemdistcache(&(cache[seed_2]), &(cache[j])); + costvector[j - 1].cost = Abs(size_alpha - size_beta); + } + qsort((void *) costvector, maxoff, sizeof(SPLITCOST), comparecost); + + for (k = 0; k < maxoff; k++) + { + j = costvector[k].pos; + if (j == seed_1) + { + *left++ = j; + v->spl_nleft++; + continue; + } + else if (j == seed_2) + { + *right++ = j; + v->spl_nright++; + continue; + } + + if (ISALLTRUE(datum_l) || cache[j].allistrue) + { + if (ISALLTRUE(datum_l) && cache[j].allistrue) + size_alpha = 0; + else + size_alpha = SIGLENBIT - sizebitvec( + (cache[j].allistrue) ? GETSIGN(datum_l) : GETSIGN(cache[j].sign) + ); + } + else + size_alpha = hemdistsign(cache[j].sign, GETSIGN(datum_l)); + + if (ISALLTRUE(datum_r) || cache[j].allistrue) + { + if (ISALLTRUE(datum_r) && cache[j].allistrue) + size_beta = 0; + else + size_beta = SIGLENBIT - sizebitvec( + (cache[j].allistrue) ? GETSIGN(datum_r) : GETSIGN(cache[j].sign) + ); + } + else + size_beta = hemdistsign(cache[j].sign, GETSIGN(datum_r)); + + if (size_alpha < size_beta + WISH_F(v->spl_nleft, v->spl_nright, 0.1)) + { + if (ISALLTRUE(datum_l) || cache[j].allistrue) + { + if (!ISALLTRUE(datum_l)) + MemSet((void *) GETSIGN(datum_l), 0xff, sizeof(BITVEC)); + } + else + { + ptr = cache[j].sign; + LOOPBYTE( + union_l[i] |= ptr[i]; + ); + } + *left++ = j; + v->spl_nleft++; + } + else + { + if (ISALLTRUE(datum_r) || cache[j].allistrue) + { + if (!ISALLTRUE(datum_r)) + MemSet((void *) GETSIGN(datum_r), 0xff, sizeof(BITVEC)); + } + else + { + ptr = cache[j].sign; + LOOPBYTE( + union_r[i] |= ptr[i]; + ); + } + *right++ = j; + v->spl_nright++; + } + } + + *right = *left = FirstOffsetNumber; + v->spl_ldatum = PointerGetDatum(datum_l); + v->spl_rdatum = PointerGetDatum(datum_r); + + PG_RETURN_POINTER(v); +} diff --git a/src/backend/utils/adt/tsquery.c b/src/backend/utils/adt/tsquery.c new file mode 100644 index 00000000000..1f8abb3298e --- /dev/null +++ b/src/backend/utils/adt/tsquery.c @@ -0,0 +1,767 @@ +/*------------------------------------------------------------------------- + * + * tsquery.c + * I/O functions for tsquery + * + * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * $PostgreSQL: pgsql/src/backend/utils/adt/tsquery.c,v 1.1 2007/08/21 01:11:19 tgl Exp $ + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "libpq/pqformat.h" +#include "tsearch/ts_locale.h" +#include "tsearch/ts_type.h" +#include "tsearch/ts_utils.h" +#include "utils/memutils.h" +#include "utils/pg_crc.h" + +/* parser's states */ +#define WAITOPERAND 1 +#define WAITOPERATOR 2 +#define WAITFIRSTOPERAND 3 +#define WAITSINGLEOPERAND 4 + +/* + * node of query tree, also used + * for storing polish notation in parser + */ +typedef struct ParseQueryNode +{ + int2 weight; + int2 type; + int4 val; + int2 distance; + int2 length; + struct ParseQueryNode *next; +} ParseQueryNode; + +static char * +get_weight(char *buf, int2 *weight) +{ + *weight = 0; + + if (!t_iseq(buf, ':')) + return buf; + + buf++; + while (*buf && pg_mblen(buf) == 1) + { + switch (*buf) + { + case 'a': + case 'A': + *weight |= 1 << 3; + break; + case 'b': + case 'B': + *weight |= 1 << 2; + break; + case 'c': + case 'C': + *weight |= 1 << 1; + break; + case 'd': + case 'D': + *weight |= 1; + break; + default: + return buf; + } + buf++; + } + + return buf; +} + +/* + * get token from query string + */ +static int4 +gettoken_query(TSQueryParserState * state, int4 *val, int4 *lenval, char **strval, int2 *weight) +{ + while (1) + { + switch (state->state) + { + case WAITFIRSTOPERAND: + case WAITOPERAND: + if (t_iseq(state->buf, '!')) + { + (state->buf)++; /* can safely ++, t_iseq guarantee + * that pg_mblen()==1 */ + *val = (int4) '!'; + state->state = WAITOPERAND; + return OPR; + } + else if (t_iseq(state->buf, '(')) + { + state->count++; + (state->buf)++; + state->state = WAITOPERAND; + return OPEN; + } + else if (t_iseq(state->buf, ':')) + { + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("syntax error at start of operand in tsearch query: \"%s\"", + state->buffer))); + } + else if (!t_isspace(state->buf)) + { + state->valstate.prsbuf = state->buf; + if (gettoken_tsvector(&(state->valstate))) + { + *strval = state->valstate.word; + *lenval = state->valstate.curpos - state->valstate.word; + state->buf = get_weight(state->valstate.prsbuf, weight); + state->state = WAITOPERATOR; + return VAL; + } + else if (state->state == WAITFIRSTOPERAND) + return END; + else + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("no operand in tsearch query: \"%s\"", + state->buffer))); + } + break; + case WAITOPERATOR: + if (t_iseq(state->buf, '&') || t_iseq(state->buf, '|')) + { + state->state = WAITOPERAND; + *val = (int4) *(state->buf); + (state->buf)++; + return OPR; + } + else if (t_iseq(state->buf, ')')) + { + (state->buf)++; + state->count--; + return (state->count < 0) ? ERR : CLOSE; + } + else if (*(state->buf) == '\0') + return (state->count) ? ERR : END; + else if (!t_isspace(state->buf)) + return ERR; + break; + case WAITSINGLEOPERAND: + if (*(state->buf) == '\0') + return END; + *strval = state->buf; + *lenval = strlen(state->buf); + state->buf += strlen(state->buf); + state->count++; + return VAL; + default: + return ERR; + break; + } + state->buf += pg_mblen(state->buf); + } + return END; +} + +/* + * push new one in polish notation reverse view + */ +void +pushquery(TSQueryParserState * state, int4 type, int4 val, int4 distance, int4 lenval, int2 weight) +{ + ParseQueryNode *tmp = (ParseQueryNode *) palloc(sizeof(ParseQueryNode)); + + tmp->weight = weight; + tmp->type = type; + tmp->val = val; + if (distance >= MAXSTRPOS) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("value is too big in tsearch query: \"%s\"", + state->buffer))); + if (lenval >= MAXSTRLEN) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("operand is too long in tsearch query: \"%s\"", + state->buffer))); + tmp->distance = distance; + tmp->length = lenval; + tmp->next = state->str; + state->str = tmp; + state->num++; +} + +/* + * This function is used for tsquery parsing + */ +void +pushval_asis(TSQueryParserState * state, int type, char *strval, int lenval, int2 weight) +{ + pg_crc32 c; + + if (lenval >= MAXSTRLEN) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("word is too long in tsearch query: \"%s\"", + state->buffer))); + + INIT_CRC32(c); + COMP_CRC32(c, strval, lenval); + FIN_CRC32(c); + pushquery(state, type, *(int4 *) &c, + state->curop - state->op, lenval, weight); + + while (state->curop - state->op + lenval + 1 >= state->lenop) + { + int4 tmp = state->curop - state->op; + + state->lenop *= 2; + state->op = (char *) repalloc((void *) state->op, state->lenop); + state->curop = state->op + tmp; + } + memcpy((void *) state->curop, (void *) strval, lenval); + state->curop += lenval; + *(state->curop) = '\0'; + state->curop++; + state->sumlen += lenval + 1 /* \0 */ ; + return; +} + +#define STACKDEPTH 32 +/* + * make polish notation of query + */ +static int4 +makepol(TSQueryParserState * state, void (*pushval) (TSQueryParserState *, int, char *, int, int2)) +{ + int4 val = 0, + type; + int4 lenval = 0; + char *strval = NULL; + int4 stack[STACKDEPTH]; + int4 lenstack = 0; + int2 weight = 0; + + while ((type = gettoken_query(state, &val, &lenval, &strval, &weight)) != END) + { + switch (type) + { + case VAL: + pushval(state, VAL, strval, lenval, weight); + while (lenstack && (stack[lenstack - 1] == (int4) '&' || + stack[lenstack - 1] == (int4) '!')) + { + lenstack--; + pushquery(state, OPR, stack[lenstack], 0, 0, 0); + } + break; + case OPR: + if (lenstack && val == (int4) '|') + pushquery(state, OPR, val, 0, 0, 0); + else + { + if (lenstack == STACKDEPTH) /* internal error */ + elog(ERROR, "tsquery stack too small"); + stack[lenstack] = val; + lenstack++; + } + break; + case OPEN: + if (makepol(state, pushval) == ERR) + return ERR; + if (lenstack && (stack[lenstack - 1] == (int4) '&' || + stack[lenstack - 1] == (int4) '!')) + { + lenstack--; + pushquery(state, OPR, stack[lenstack], 0, 0, 0); + } + break; + case CLOSE: + while (lenstack) + { + lenstack--; + pushquery(state, OPR, stack[lenstack], 0, 0, 0); + }; + return END; + break; + case ERR: + default: + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("syntax error in tsearch query: \"%s\"", + state->buffer))); + return ERR; + + } + } + while (lenstack) + { + lenstack--; + pushquery(state, OPR, stack[lenstack], 0, 0, 0); + }; + return END; +} + +static void +findoprnd(QueryItem * ptr, int4 *pos) +{ + if (ptr[*pos].type == VAL || ptr[*pos].type == VALSTOP) + { + ptr[*pos].left = 0; + (*pos)++; + } + else if (ptr[*pos].val == (int4) '!') + { + ptr[*pos].left = 1; + (*pos)++; + findoprnd(ptr, pos); + } + else + { + QueryItem *curitem = &ptr[*pos]; + int4 tmp = *pos; + + (*pos)++; + findoprnd(ptr, pos); + curitem->left = *pos - tmp; + findoprnd(ptr, pos); + } +} + + +/* + * input + */ +TSQuery +parse_tsquery(char *buf, void (*pushval) (TSQueryParserState *, int, char *, int, int2), Oid cfg_id, bool isplain) +{ + TSQueryParserState state; + int4 i; + TSQuery query; + int4 commonlen; + QueryItem *ptr; + ParseQueryNode *tmp; + int4 pos = 0; + + /* init state */ + state.buffer = buf; + state.buf = buf; + state.state = (isplain) ? WAITSINGLEOPERAND : WAITFIRSTOPERAND; + state.count = 0; + state.num = 0; + state.str = NULL; + state.cfg_id = cfg_id; + + /* init value parser's state */ + state.valstate.oprisdelim = true; + state.valstate.len = 32; + state.valstate.word = (char *) palloc(state.valstate.len); + + /* init list of operand */ + state.sumlen = 0; + state.lenop = 64; + state.curop = state.op = (char *) palloc(state.lenop); + *(state.curop) = '\0'; + + /* parse query & make polish notation (postfix, but in reverse order) */ + makepol(&state, pushval); + pfree(state.valstate.word); + if (!state.num) + { + ereport(NOTICE, + (errmsg("tsearch query doesn't contain lexeme(s): \"%s\"", + state.buffer))); + query = (TSQuery) palloc(HDRSIZETQ); + SET_VARSIZE(query, HDRSIZETQ); + query->size = 0; + return query; + } + + /* make finish struct */ + commonlen = COMPUTESIZE(state.num, state.sumlen); + query = (TSQuery) palloc(commonlen); + SET_VARSIZE(query, commonlen); + query->size = state.num; + ptr = GETQUERY(query); + + /* set item in polish notation */ + for (i = 0; i < state.num; i++) + { + ptr[i].weight = state.str->weight; + ptr[i].type = state.str->type; + ptr[i].val = state.str->val; + ptr[i].distance = state.str->distance; + ptr[i].length = state.str->length; + tmp = state.str->next; + pfree(state.str); + state.str = tmp; + } + + /* set user friendly-operand view */ + memcpy((void *) GETOPERAND(query), (void *) state.op, state.sumlen); + pfree(state.op); + + /* set left operand's position for every operator */ + pos = 0; + findoprnd(ptr, &pos); + + return query; +} + +/* + * in without morphology + */ +Datum +tsqueryin(PG_FUNCTION_ARGS) +{ + char *in = PG_GETARG_CSTRING(0); + + pg_verifymbstr(in, strlen(in), false); + + PG_RETURN_TSQUERY(parse_tsquery(in, pushval_asis, InvalidOid, false)); +} + +/* + * out function + */ +typedef struct +{ + QueryItem *curpol; + char *buf; + char *cur; + char *op; + int4 buflen; +} INFIX; + +#define RESIZEBUF(inf,addsize) \ +while( ( (inf)->cur - (inf)->buf ) + (addsize) + 1 >= (inf)->buflen ) \ +{ \ + int4 len = (inf)->cur - (inf)->buf; \ + (inf)->buflen *= 2; \ + (inf)->buf = (char*) repalloc( (void*)(inf)->buf, (inf)->buflen ); \ + (inf)->cur = (inf)->buf + len; \ +} + +/* + * recursive walk on tree and print it in + * infix (human-readable) view + */ +static void +infix(INFIX * in, bool first) +{ + if (in->curpol->type == VAL) + { + char *op = in->op + in->curpol->distance; + int clen; + + RESIZEBUF(in, in->curpol->length * (pg_database_encoding_max_length() + 1) + 2 + 5); + *(in->cur) = '\''; + in->cur++; + while (*op) + { + if (t_iseq(op, '\'')) + { + *(in->cur) = '\''; + in->cur++; + } + COPYCHAR(in->cur, op); + + clen = pg_mblen(op); + op += clen; + in->cur += clen; + } + *(in->cur) = '\''; + in->cur++; + if (in->curpol->weight) + { + *(in->cur) = ':'; + in->cur++; + if (in->curpol->weight & (1 << 3)) + { + *(in->cur) = 'A'; + in->cur++; + } + if (in->curpol->weight & (1 << 2)) + { + *(in->cur) = 'B'; + in->cur++; + } + if (in->curpol->weight & (1 << 1)) + { + *(in->cur) = 'C'; + in->cur++; + } + if (in->curpol->weight & 1) + { + *(in->cur) = 'D'; + in->cur++; + } + } + *(in->cur) = '\0'; + in->curpol++; + } + else if (in->curpol->val == (int4) '!') + { + bool isopr = false; + + RESIZEBUF(in, 1); + *(in->cur) = '!'; + in->cur++; + *(in->cur) = '\0'; + in->curpol++; + if (in->curpol->type == OPR) + { + isopr = true; + RESIZEBUF(in, 2); + sprintf(in->cur, "( "); + in->cur = strchr(in->cur, '\0'); + } + infix(in, isopr); + if (isopr) + { + RESIZEBUF(in, 2); + sprintf(in->cur, " )"); + in->cur = strchr(in->cur, '\0'); + } + } + else + { + int4 op = in->curpol->val; + INFIX nrm; + + in->curpol++; + if (op == (int4) '|' && !first) + { + RESIZEBUF(in, 2); + sprintf(in->cur, "( "); + in->cur = strchr(in->cur, '\0'); + } + + nrm.curpol = in->curpol; + nrm.op = in->op; + nrm.buflen = 16; + nrm.cur = nrm.buf = (char *) palloc(sizeof(char) * nrm.buflen); + + /* get right operand */ + infix(&nrm, false); + + /* get & print left operand */ + in->curpol = nrm.curpol; + infix(in, false); + + /* print operator & right operand */ + RESIZEBUF(in, 3 + (nrm.cur - nrm.buf)); + sprintf(in->cur, " %c %s", op, nrm.buf); + in->cur = strchr(in->cur, '\0'); + pfree(nrm.buf); + + if (op == (int4) '|' && !first) + { + RESIZEBUF(in, 2); + sprintf(in->cur, " )"); + in->cur = strchr(in->cur, '\0'); + } + } +} + + +Datum +tsqueryout(PG_FUNCTION_ARGS) +{ + TSQuery query = PG_GETARG_TSQUERY(0); + INFIX nrm; + + if (query->size == 0) + { + char *b = palloc(1); + + *b = '\0'; + PG_RETURN_POINTER(b); + } + nrm.curpol = GETQUERY(query); + nrm.buflen = 32; + nrm.cur = nrm.buf = (char *) palloc(sizeof(char) * nrm.buflen); + *(nrm.cur) = '\0'; + nrm.op = GETOPERAND(query); + infix(&nrm, true); + + PG_FREE_IF_COPY(query, 0); + PG_RETURN_CSTRING(nrm.buf); +} + +Datum +tsquerysend(PG_FUNCTION_ARGS) +{ + TSQuery query = PG_GETARG_TSQUERY(0); + StringInfoData buf; + int i; + QueryItem *item = GETQUERY(query); + + pq_begintypsend(&buf); + + pq_sendint(&buf, query->size, sizeof(int32)); + for (i = 0; i < query->size; i++) + { + int tmp; + + pq_sendint(&buf, item->type, sizeof(item->type)); + pq_sendint(&buf, item->weight, sizeof(item->weight)); + pq_sendint(&buf, item->left, sizeof(item->left)); + pq_sendint(&buf, item->val, sizeof(item->val)); + + /* + * We are sure that sizeof(WordEntry) == sizeof(int32), and about + * layout of QueryItem + */ + tmp = *(int32 *) (((char *) item) + HDRSIZEQI); + pq_sendint(&buf, tmp, sizeof(tmp)); + + item++; + } + + item = GETQUERY(query); + for (i = 0; i < query->size; i++) + { + if (item->type == VAL) + pq_sendbytes(&buf, GETOPERAND(query) + item->distance, item->length); + item++; + } + + PG_FREE_IF_COPY(query, 0); + + PG_RETURN_BYTEA_P(pq_endtypsend(&buf)); +} + +Datum +tsqueryrecv(PG_FUNCTION_ARGS) +{ + StringInfo buf = (StringInfo) PG_GETARG_POINTER(0); + TSQuery query; + int i, + size, + tmp, + len = HDRSIZETQ; + QueryItem *item; + int datalen = 0; + char *ptr; + + size = pq_getmsgint(buf, sizeof(uint32)); + if (size < 0 || size > (MaxAllocSize / sizeof(QueryItem))) + elog(ERROR, "invalid size of tsquery"); + len += sizeof(QueryItem) * size; + + query = (TSQuery) palloc(len); + query->size = size; + item = GETQUERY(query); + + for (i = 0; i < size; i++) + { + item->type = (int8) pq_getmsgint(buf, sizeof(int8)); + item->weight = (int8) pq_getmsgint(buf, sizeof(int8)); + item->left = (int16) pq_getmsgint(buf, sizeof(int16)); + item->val = (int32) pq_getmsgint(buf, sizeof(int32)); + tmp = pq_getmsgint(buf, sizeof(int32)); + memcpy((((char *) item) + HDRSIZEQI), &tmp, sizeof(int32)); + + /* + * Sanity checks + */ + if (item->type == VAL) + { + datalen += item->length + 1; /* \0 */ + } + else if (item->type == OPR) + { + if (item->val == '|' || item->val == '&') + { + if (item->left <= 0 || i + item->left >= size) + elog(ERROR, "invalid pointer to left operand"); + } + + if (i == size - 1) + elog(ERROR, "invalid pointer to right operand"); + } + else + elog(ERROR, "unknown tsquery node type"); + + item++; + } + + query = (TSQuery) repalloc(query, len + datalen); + + item = GETQUERY(query); + ptr = GETOPERAND(query); + for (i = 0; i < size; i++) + { + if (item->type == VAL) + { + item->distance = ptr - GETOPERAND(query); + memcpy(ptr, + pq_getmsgbytes(buf, item->length), + item->length); + ptr += item->length; + *ptr++ = '\0'; + } + item++; + } + + Assert(ptr - GETOPERAND(query) == datalen); + + SET_VARSIZE(query, len + datalen); + + PG_RETURN_TSVECTOR(query); +} + +/* + * debug function, used only for view query + * which will be executed in non-leaf pages in index + */ +Datum +tsquerytree(PG_FUNCTION_ARGS) +{ + TSQuery query = PG_GETARG_TSQUERY(0); + INFIX nrm; + text *res; + QueryItem *q; + int4 len; + + if (query->size == 0) + { + res = (text *) palloc(VARHDRSZ); + SET_VARSIZE(res, VARHDRSZ); + PG_RETURN_POINTER(res); + } + + q = clean_NOT(GETQUERY(query), &len); + + if (!q) + { + res = (text *) palloc(1 + VARHDRSZ); + SET_VARSIZE(res, 1 + VARHDRSZ); + *((char *) VARDATA(res)) = 'T'; + } + else + { + nrm.curpol = q; + nrm.buflen = 32; + nrm.cur = nrm.buf = (char *) palloc(sizeof(char) * nrm.buflen); + *(nrm.cur) = '\0'; + nrm.op = GETOPERAND(query); + infix(&nrm, true); + + res = (text *) palloc(nrm.cur - nrm.buf + VARHDRSZ); + SET_VARSIZE(res, nrm.cur - nrm.buf + VARHDRSZ); + strncpy(VARDATA(res), nrm.buf, nrm.cur - nrm.buf); + pfree(q); + } + + PG_FREE_IF_COPY(query, 0); + + PG_RETURN_POINTER(res); +} diff --git a/src/backend/utils/adt/tsquery_cleanup.c b/src/backend/utils/adt/tsquery_cleanup.c new file mode 100644 index 00000000000..7991a4ad198 --- /dev/null +++ b/src/backend/utils/adt/tsquery_cleanup.c @@ -0,0 +1,261 @@ +/*------------------------------------------------------------------------- + * + * tsquery_cleanup.c + * Cleanup query from NOT values and/or stopword + * Utility functions to correct work. + * + * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * $PostgreSQL: pgsql/src/backend/utils/adt/tsquery_cleanup.c,v 1.1 2007/08/21 01:11:19 tgl Exp $ + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "tsearch/ts_type.h" +#include "tsearch/ts_utils.h" + +typedef struct NODE +{ + struct NODE *left; + struct NODE *right; + QueryItem *valnode; +} NODE; + +/* + * make query tree from plain view of query + */ +static NODE * +maketree(QueryItem * in) +{ + NODE *node = (NODE *) palloc(sizeof(NODE)); + + node->valnode = in; + node->right = node->left = NULL; + if (in->type == OPR) + { + node->right = maketree(in + 1); + if (in->val != (int4) '!') + node->left = maketree(in + in->left); + } + return node; +} + +typedef struct +{ + QueryItem *ptr; + int4 len; + int4 cur; +} PLAINTREE; + +static void +plainnode(PLAINTREE * state, NODE * node) +{ + if (state->cur == state->len) + { + state->len *= 2; + state->ptr = (QueryItem *) repalloc((void *) state->ptr, state->len * sizeof(QueryItem)); + } + memcpy((void *) &(state->ptr[state->cur]), (void *) node->valnode, sizeof(QueryItem)); + if (node->valnode->type == VAL) + state->cur++; + else if (node->valnode->val == (int4) '!') + { + state->ptr[state->cur].left = 1; + state->cur++; + plainnode(state, node->right); + } + else + { + int4 cur = state->cur; + + state->cur++; + plainnode(state, node->right); + state->ptr[cur].left = state->cur - cur; + plainnode(state, node->left); + } + pfree(node); +} + +/* + * make plain view of tree from 'normal' view of tree + */ +static QueryItem * +plaintree(NODE * root, int4 *len) +{ + PLAINTREE pl; + + pl.cur = 0; + pl.len = 16; + if (root && (root->valnode->type == VAL || root->valnode->type == OPR)) + { + pl.ptr = (QueryItem *) palloc(pl.len * sizeof(QueryItem)); + plainnode(&pl, root); + } + else + pl.ptr = NULL; + *len = pl.cur; + return pl.ptr; +} + +static void +freetree(NODE * node) +{ + if (!node) + return; + if (node->left) + freetree(node->left); + if (node->right) + freetree(node->right); + pfree(node); +} + +/* + * clean tree for ! operator. + * It's usefull for debug, but in + * other case, such view is used with search in index. + * Operator ! always return TRUE + */ +static NODE * +clean_NOT_intree(NODE * node) +{ + if (node->valnode->type == VAL) + return node; + + if (node->valnode->val == (int4) '!') + { + freetree(node); + return NULL; + } + + /* operator & or | */ + if (node->valnode->val == (int4) '|') + { + if ((node->left = clean_NOT_intree(node->left)) == NULL || + (node->right = clean_NOT_intree(node->right)) == NULL) + { + freetree(node); + return NULL; + } + } + else + { + NODE *res = node; + + node->left = clean_NOT_intree(node->left); + node->right = clean_NOT_intree(node->right); + if (node->left == NULL && node->right == NULL) + { + pfree(node); + res = NULL; + } + else if (node->left == NULL) + { + res = node->right; + pfree(node); + } + else if (node->right == NULL) + { + res = node->left; + pfree(node); + } + return res; + } + return node; +} + +QueryItem * +clean_NOT(QueryItem * ptr, int4 *len) +{ + NODE *root = maketree(ptr); + + return plaintree(clean_NOT_intree(root), len); +} + + +#ifdef V_UNKNOWN /* exists in Windows headers */ +#undef V_UNKNOWN +#endif + +#define V_UNKNOWN 0 +#define V_TRUE 1 +#define V_FALSE 2 +#define V_STOP 3 + +/* + * Clean query tree from values which is always in + * text (stopword) + */ +static NODE * +clean_fakeval_intree(NODE * node, char *result) +{ + char lresult = V_UNKNOWN, + rresult = V_UNKNOWN; + + if (node->valnode->type == VAL) + return node; + else if (node->valnode->type == VALSTOP) + { + pfree(node); + *result = V_STOP; + return NULL; + } + + + if (node->valnode->val == (int4) '!') + { + node->right = clean_fakeval_intree(node->right, &rresult); + if (!node->right) + { + *result = V_STOP; + freetree(node); + return NULL; + } + } + else + { + NODE *res = node; + + node->left = clean_fakeval_intree(node->left, &lresult); + node->right = clean_fakeval_intree(node->right, &rresult); + if (lresult == V_STOP && rresult == V_STOP) + { + freetree(node); + *result = V_STOP; + return NULL; + } + else if (lresult == V_STOP) + { + res = node->right; + pfree(node); + } + else if (rresult == V_STOP) + { + res = node->left; + pfree(node); + } + return res; + } + return node; +} + +QueryItem * +clean_fakeval(QueryItem * ptr, int4 *len) +{ + NODE *root = maketree(ptr); + char result = V_UNKNOWN; + NODE *resroot; + + resroot = clean_fakeval_intree(root, &result); + if (result != V_UNKNOWN) + { + elog(NOTICE, "query contains only stopword(s) or doesn't contain lexeme(s), ignored"); + *len = 0; + return NULL; + } + + return plaintree(resroot, len); +} diff --git a/src/backend/utils/adt/tsquery_gist.c b/src/backend/utils/adt/tsquery_gist.c new file mode 100644 index 00000000000..0deca10075c --- /dev/null +++ b/src/backend/utils/adt/tsquery_gist.c @@ -0,0 +1,259 @@ +/*------------------------------------------------------------------------- + * + * tsquery_gist.c + * GiST index support for tsquery + * + * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * $PostgreSQL: pgsql/src/backend/utils/adt/tsquery_gist.c,v 1.1 2007/08/21 01:11:19 tgl Exp $ + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/skey.h" +#include "access/gist.h" +#include "tsearch/ts_type.h" +#include "tsearch/ts_utils.h" + +#define GETENTRY(vec,pos) ((TSQuerySign *) DatumGetPointer((vec)->vector[(pos)].key)) + +Datum +gtsquery_compress(PG_FUNCTION_ARGS) +{ + GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0); + GISTENTRY *retval = entry; + + if (entry->leafkey) + { + TSQuerySign *sign = (TSQuerySign *) palloc(sizeof(TSQuerySign)); + + retval = (GISTENTRY *) palloc(sizeof(GISTENTRY)); + *sign = makeTSQuerySign(DatumGetTSQuery(entry->key)); + + gistentryinit(*retval, PointerGetDatum(sign), + entry->rel, entry->page, + entry->offset, FALSE); + } + + PG_RETURN_POINTER(retval); +} + +Datum +gtsquery_decompress(PG_FUNCTION_ARGS) +{ + PG_RETURN_DATUM(PG_GETARG_DATUM(0)); +} + +Datum +gtsquery_consistent(PG_FUNCTION_ARGS) +{ + GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0); + TSQuerySign *key = (TSQuerySign *) DatumGetPointer(entry->key); + TSQuery query = PG_GETARG_TSQUERY(1); + StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2); + TSQuerySign sq = makeTSQuerySign(query); + bool retval; + + switch (strategy) + { + case RTContainsStrategyNumber: + if (GIST_LEAF(entry)) + retval = (*key & sq) == sq; + else + retval = (*key & sq) != 0; + break; + case RTContainedByStrategyNumber: + if (GIST_LEAF(entry)) + retval = (*key & sq) == *key; + else + retval = (*key & sq) != 0; + break; + default: + retval = FALSE; + } + PG_RETURN_BOOL(retval); +} + +Datum +gtsquery_union(PG_FUNCTION_ARGS) +{ + GistEntryVector *entryvec = (GistEntryVector *) PG_GETARG_POINTER(0); + int *size = (int *) PG_GETARG_POINTER(1); + TSQuerySign *sign = (TSQuerySign *) palloc(sizeof(TSQuerySign)); + int i; + + memset(sign, 0, sizeof(TSQuerySign)); + + for (i = 0; i < entryvec->n; i++) + *sign |= *GETENTRY(entryvec, i); + + *size = sizeof(TSQuerySign); + + PG_RETURN_POINTER(sign); +} + +Datum +gtsquery_same(PG_FUNCTION_ARGS) +{ + TSQuerySign *a = (TSQuerySign *) PG_GETARG_POINTER(0); + TSQuerySign *b = (TSQuerySign *) PG_GETARG_POINTER(1); + + PG_RETURN_POINTER(*a == *b); +} + +static int +sizebitvec(TSQuerySign sign) +{ + int size = 0, + i; + + for (i = 0; i < TSQS_SIGLEN; i++) + size += 0x01 & (sign >> i); + + return size; +} + +static int +hemdist(TSQuerySign a, TSQuerySign b) +{ + TSQuerySign res = a ^ b; + + return sizebitvec(res); +} + +Datum +gtsquery_penalty(PG_FUNCTION_ARGS) +{ + TSQuerySign *origval = (TSQuerySign *) DatumGetPointer(((GISTENTRY *) PG_GETARG_POINTER(0))->key); + TSQuerySign *newval = (TSQuerySign *) DatumGetPointer(((GISTENTRY *) PG_GETARG_POINTER(1))->key); + float *penalty = (float *) PG_GETARG_POINTER(2); + + *penalty = hemdist(*origval, *newval); + + PG_RETURN_POINTER(penalty); +} + + +typedef struct +{ + OffsetNumber pos; + int4 cost; +} SPLITCOST; + +static int +comparecost(const void *a, const void *b) +{ + if (((SPLITCOST *) a)->cost == ((SPLITCOST *) b)->cost) + return 0; + else + return (((SPLITCOST *) a)->cost > ((SPLITCOST *) b)->cost) ? 1 : -1; +} + +#define WISH_F(a,b,c) (double)( -(double)(((a)-(b))*((a)-(b))*((a)-(b)))*(c) ) + +Datum +gtsquery_picksplit(PG_FUNCTION_ARGS) +{ + GistEntryVector *entryvec = (GistEntryVector *) PG_GETARG_POINTER(0); + GIST_SPLITVEC *v = (GIST_SPLITVEC *) PG_GETARG_POINTER(1); + OffsetNumber maxoff = entryvec->n - 2; + OffsetNumber k, + j; + + TSQuerySign *datum_l, + *datum_r; + int4 size_alpha, + size_beta; + int4 size_waste, + waste = -1; + int4 nbytes; + OffsetNumber seed_1 = 0, + seed_2 = 0; + OffsetNumber *left, + *right; + + SPLITCOST *costvector; + + nbytes = (maxoff + 2) * sizeof(OffsetNumber); + left = v->spl_left = (OffsetNumber *) palloc(nbytes); + right = v->spl_right = (OffsetNumber *) palloc(nbytes); + v->spl_nleft = v->spl_nright = 0; + + for (k = FirstOffsetNumber; k < maxoff; k = OffsetNumberNext(k)) + for (j = OffsetNumberNext(k); j <= maxoff; j = OffsetNumberNext(j)) + { + size_waste = hemdist(*GETENTRY(entryvec, j), *GETENTRY(entryvec, k)); + if (size_waste > waste) + { + waste = size_waste; + seed_1 = k; + seed_2 = j; + } + } + + + if (seed_1 == 0 || seed_2 == 0) + { + seed_1 = 1; + seed_2 = 2; + } + + datum_l = (TSQuerySign *) palloc(sizeof(TSQuerySign)); + *datum_l = *GETENTRY(entryvec, seed_1); + datum_r = (TSQuerySign *) palloc(sizeof(TSQuerySign)); + *datum_r = *GETENTRY(entryvec, seed_2); + + + maxoff = OffsetNumberNext(maxoff); + costvector = (SPLITCOST *) palloc(sizeof(SPLITCOST) * maxoff); + for (j = FirstOffsetNumber; j <= maxoff; j = OffsetNumberNext(j)) + { + costvector[j - 1].pos = j; + size_alpha = hemdist(*GETENTRY(entryvec, seed_1), *GETENTRY(entryvec, j)); + size_beta = hemdist(*GETENTRY(entryvec, seed_2), *GETENTRY(entryvec, j)); + costvector[j - 1].cost = abs(size_alpha - size_beta); + } + qsort((void *) costvector, maxoff, sizeof(SPLITCOST), comparecost); + + for (k = 0; k < maxoff; k++) + { + j = costvector[k].pos; + if (j == seed_1) + { + *left++ = j; + v->spl_nleft++; + continue; + } + else if (j == seed_2) + { + *right++ = j; + v->spl_nright++; + continue; + } + size_alpha = hemdist(*datum_l, *GETENTRY(entryvec, j)); + size_beta = hemdist(*datum_r, *GETENTRY(entryvec, j)); + + if (size_alpha < size_beta + WISH_F(v->spl_nleft, v->spl_nright, 0.05)) + { + *datum_l |= *GETENTRY(entryvec, j); + *left++ = j; + v->spl_nleft++; + } + else + { + *datum_r |= *GETENTRY(entryvec, j); + *right++ = j; + v->spl_nright++; + } + } + + *right = *left = FirstOffsetNumber; + v->spl_ldatum = PointerGetDatum(datum_l); + v->spl_rdatum = PointerGetDatum(datum_r); + + PG_RETURN_POINTER(v); +} diff --git a/src/backend/utils/adt/tsquery_op.c b/src/backend/utils/adt/tsquery_op.c new file mode 100644 index 00000000000..fd97c2796df --- /dev/null +++ b/src/backend/utils/adt/tsquery_op.c @@ -0,0 +1,289 @@ +/*------------------------------------------------------------------------- + * + * tsquery_op.c + * Various operations with tsquery + * + * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * $PostgreSQL: pgsql/src/backend/utils/adt/tsquery_op.c,v 1.1 2007/08/21 01:11:19 tgl Exp $ + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "tsearch/ts_type.h" +#include "tsearch/ts_locale.h" +#include "tsearch/ts_utils.h" +#include "utils/pg_crc.h" + +Datum +tsquery_numnode(PG_FUNCTION_ARGS) +{ + TSQuery query = PG_GETARG_TSQUERY(0); + int nnode = query->size; + + PG_FREE_IF_COPY(query, 0); + PG_RETURN_INT32(nnode); +} + +static QTNode * +join_tsqueries(TSQuery a, TSQuery b) +{ + QTNode *res = (QTNode *) palloc0(sizeof(QTNode)); + + res->flags |= QTN_NEEDFREE; + + res->valnode = (QueryItem *) palloc0(sizeof(QueryItem)); + res->valnode->type = OPR; + + res->child = (QTNode **) palloc0(sizeof(QTNode *) * 2); + res->child[0] = QT2QTN(GETQUERY(b), GETOPERAND(b)); + res->child[1] = QT2QTN(GETQUERY(a), GETOPERAND(a)); + res->nchild = 2; + + return res; +} + +Datum +tsquery_and(PG_FUNCTION_ARGS) +{ + TSQuery a = PG_GETARG_TSQUERY_COPY(0); + TSQuery b = PG_GETARG_TSQUERY_COPY(1); + QTNode *res; + TSQuery query; + + if (a->size == 0) + { + PG_FREE_IF_COPY(a, 1); + PG_RETURN_POINTER(b); + } + else if (b->size == 0) + { + PG_FREE_IF_COPY(b, 1); + PG_RETURN_POINTER(a); + } + + res = join_tsqueries(a, b); + + res->valnode->val = '&'; + + query = QTN2QT(res); + + QTNFree(res); + PG_FREE_IF_COPY(a, 0); + PG_FREE_IF_COPY(b, 1); + + PG_RETURN_TSQUERY(query); +} + +Datum +tsquery_or(PG_FUNCTION_ARGS) +{ + TSQuery a = PG_GETARG_TSQUERY_COPY(0); + TSQuery b = PG_GETARG_TSQUERY_COPY(1); + QTNode *res; + TSQuery query; + + if (a->size == 0) + { + PG_FREE_IF_COPY(a, 1); + PG_RETURN_POINTER(b); + } + else if (b->size == 0) + { + PG_FREE_IF_COPY(b, 1); + PG_RETURN_POINTER(a); + } + + res = join_tsqueries(a, b); + + res->valnode->val = '|'; + + query = QTN2QT(res); + + QTNFree(res); + PG_FREE_IF_COPY(a, 0); + PG_FREE_IF_COPY(b, 1); + + PG_RETURN_POINTER(query); +} + +Datum +tsquery_not(PG_FUNCTION_ARGS) +{ + TSQuery a = PG_GETARG_TSQUERY_COPY(0); + QTNode *res; + TSQuery query; + + if (a->size == 0) + PG_RETURN_POINTER(a); + + res = (QTNode *) palloc0(sizeof(QTNode)); + + res->flags |= QTN_NEEDFREE; + + res->valnode = (QueryItem *) palloc0(sizeof(QueryItem)); + res->valnode->type = OPR; + res->valnode->val = '!'; + + res->child = (QTNode **) palloc0(sizeof(QTNode *)); + res->child[0] = QT2QTN(GETQUERY(a), GETOPERAND(a)); + res->nchild = 1; + + query = QTN2QT(res); + + QTNFree(res); + PG_FREE_IF_COPY(a, 0); + + PG_RETURN_POINTER(query); +} + +static int +CompareTSQ(TSQuery a, TSQuery b) +{ + if (a->size != b->size) + { + return (a->size < b->size) ? -1 : 1; + } + else if (VARSIZE(a) != VARSIZE(b)) + { + return (VARSIZE(a) < VARSIZE(b)) ? -1 : 1; + } + else + { + QTNode *an = QT2QTN(GETQUERY(a), GETOPERAND(a)); + QTNode *bn = QT2QTN(GETQUERY(b), GETOPERAND(b)); + int res = QTNodeCompare(an, bn); + + QTNFree(an); + QTNFree(bn); + + return res; + } + + return 0; +} + +Datum +tsquery_cmp(PG_FUNCTION_ARGS) +{ + TSQuery a = PG_GETARG_TSQUERY_COPY(0); + TSQuery b = PG_GETARG_TSQUERY_COPY(1); + int res = CompareTSQ(a, b); + + PG_FREE_IF_COPY(a, 0); + PG_FREE_IF_COPY(b, 1); + + PG_RETURN_INT32(res); +} + +#define CMPFUNC( NAME, CONDITION ) \ +Datum \ +NAME(PG_FUNCTION_ARGS) { \ + TSQuery a = PG_GETARG_TSQUERY_COPY(0); \ + TSQuery b = PG_GETARG_TSQUERY_COPY(1); \ + int res = CompareTSQ(a,b); \ + \ + PG_FREE_IF_COPY(a,0); \ + PG_FREE_IF_COPY(b,1); \ + \ + PG_RETURN_BOOL( CONDITION ); \ +} + +CMPFUNC(tsquery_lt, res < 0); +CMPFUNC(tsquery_le, res <= 0); +CMPFUNC(tsquery_eq, res == 0); +CMPFUNC(tsquery_ge, res >= 0); +CMPFUNC(tsquery_gt, res > 0); +CMPFUNC(tsquery_ne, res != 0); + +TSQuerySign +makeTSQuerySign(TSQuery a) +{ + int i; + QueryItem *ptr = GETQUERY(a); + TSQuerySign sign = 0; + + for (i = 0; i < a->size; i++) + { + if (ptr->type == VAL) + sign |= ((TSQuerySign) 1) << (ptr->val % TSQS_SIGLEN); + ptr++; + } + + return sign; +} + +Datum +tsq_mcontains(PG_FUNCTION_ARGS) +{ + TSQuery query = PG_GETARG_TSQUERY(0); + TSQuery ex = PG_GETARG_TSQUERY(1); + TSQuerySign sq, + se; + int i, + j; + QueryItem *iq, + *ie; + + if (query->size < ex->size) + { + PG_FREE_IF_COPY(query, 0); + PG_FREE_IF_COPY(ex, 1); + + PG_RETURN_BOOL(false); + } + + sq = makeTSQuerySign(query); + se = makeTSQuerySign(ex); + + if ((sq & se) != se) + { + PG_FREE_IF_COPY(query, 0); + PG_FREE_IF_COPY(ex, 1); + + PG_RETURN_BOOL(false); + } + + ie = GETQUERY(ex); + + for (i = 0; i < ex->size; i++) + { + iq = GETQUERY(query); + if (ie[i].type != VAL) + continue; + for (j = 0; j < query->size; j++) + if (iq[j].type == VAL && ie[i].val == iq[j].val) + { + j = query->size + 1; + break; + } + if (j == query->size) + { + PG_FREE_IF_COPY(query, 0); + PG_FREE_IF_COPY(ex, 1); + + PG_RETURN_BOOL(false); + } + } + + PG_FREE_IF_COPY(query, 0); + PG_FREE_IF_COPY(ex, 1); + + PG_RETURN_BOOL(true); +} + +Datum +tsq_mcontained(PG_FUNCTION_ARGS) +{ + PG_RETURN_DATUM( + DirectFunctionCall2( + tsq_mcontains, + PG_GETARG_DATUM(1), + PG_GETARG_DATUM(0) + ) + ); +} diff --git a/src/backend/utils/adt/tsquery_rewrite.c b/src/backend/utils/adt/tsquery_rewrite.c new file mode 100644 index 00000000000..f0d22c644ae --- /dev/null +++ b/src/backend/utils/adt/tsquery_rewrite.c @@ -0,0 +1,524 @@ +/*------------------------------------------------------------------------- + * + * tsquery_rewrite.c + * Utilities for reconstructing tsquery + * + * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * $PostgreSQL: pgsql/src/backend/utils/adt/tsquery_rewrite.c,v 1.1 2007/08/21 01:11:19 tgl Exp $ + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "executor/spi.h" +#include "tsearch/ts_type.h" +#include "tsearch/ts_utils.h" + + +static int +addone(int *counters, int last, int total) +{ + counters[last]++; + if (counters[last] >= total) + { + if (last == 0) + return 0; + if (addone(counters, last - 1, total - 1) == 0) + return 0; + counters[last] = counters[last - 1] + 1; + } + return 1; +} + +static QTNode * +findeq(QTNode *node, QTNode *ex, QTNode *subs, bool *isfind) +{ + + if ((node->sign & ex->sign) != ex->sign || node->valnode->type != ex->valnode->type || node->valnode->val != ex->valnode->val) + return node; + + if (node->flags & QTN_NOCHANGE) + return node; + + if (node->valnode->type == OPR) + { + if (node->nchild == ex->nchild) + { + if (QTNEq(node, ex)) + { + QTNFree(node); + if (subs) + { + node = QTNCopy(subs); + node->flags |= QTN_NOCHANGE; + } + else + node = NULL; + *isfind = true; + } + } + else if (node->nchild > ex->nchild) + { + int *counters = (int *) palloc(sizeof(int) * node->nchild); + int i; + QTNode *tnode = (QTNode *) palloc(sizeof(QTNode)); + + memset(tnode, 0, sizeof(QTNode)); + tnode->child = (QTNode **) palloc(sizeof(QTNode *) * ex->nchild); + tnode->nchild = ex->nchild; + tnode->valnode = (QueryItem *) palloc(sizeof(QueryItem)); + *(tnode->valnode) = *(ex->valnode); + + for (i = 0; i < ex->nchild; i++) + counters[i] = i; + + do + { + tnode->sign = 0; + for (i = 0; i < ex->nchild; i++) + { + tnode->child[i] = node->child[counters[i]]; + tnode->sign |= tnode->child[i]->sign; + } + + if (QTNEq(tnode, ex)) + { + int j = 0; + + pfree(tnode->valnode); + pfree(tnode->child); + pfree(tnode); + if (subs) + { + tnode = QTNCopy(subs); + tnode->flags = QTN_NOCHANGE | QTN_NEEDFREE; + } + else + tnode = NULL; + + node->child[counters[0]] = tnode; + + for (i = 1; i < ex->nchild; i++) + node->child[counters[i]] = NULL; + for (i = 0; i < node->nchild; i++) + { + if (node->child[i]) + { + node->child[j] = node->child[i]; + j++; + } + } + + node->nchild = j; + + *isfind = true; + + break; + } + } while (addone(counters, ex->nchild - 1, node->nchild)); + if (tnode && (tnode->flags & QTN_NOCHANGE) == 0) + { + pfree(tnode->valnode); + pfree(tnode->child); + pfree(tnode); + } + else + QTNSort(node); + pfree(counters); + } + } + else if (QTNEq(node, ex)) + { + QTNFree(node); + if (subs) + { + node = QTNCopy(subs); + node->flags |= QTN_NOCHANGE; + } + else + { + node = NULL; + } + *isfind = true; + } + + return node; +} + +static QTNode * +dofindsubquery(QTNode *root, QTNode *ex, QTNode *subs, bool *isfind) +{ + root = findeq(root, ex, subs, isfind); + + if (root && (root->flags & QTN_NOCHANGE) == 0 && root->valnode->type == OPR) + { + int i; + + for (i = 0; i < root->nchild; i++) + root->child[i] = dofindsubquery(root->child[i], ex, subs, isfind); + } + + return root; +} + +static QTNode * +dropvoidsubtree(QTNode * root) +{ + + if (!root) + return NULL; + + if (root->valnode->type == OPR) + { + int i, + j = 0; + + for (i = 0; i < root->nchild; i++) + { + if (root->child[i]) + { + root->child[j] = root->child[i]; + j++; + } + } + + root->nchild = j; + + if (root->valnode->val == (int4) '!' && root->nchild == 0) + { + QTNFree(root); + root = NULL; + } + else if (root->nchild == 1) + { + QTNode *nroot = root->child[0]; + + pfree(root); + root = nroot; + } + } + + return root; +} + +static QTNode * +findsubquery(QTNode *root, QTNode *ex, QTNode *subs, bool *isfind) +{ + bool DidFind = false; + + root = dofindsubquery(root, ex, subs, &DidFind); + + if (!subs && DidFind) + root = dropvoidsubtree(root); + + if (isfind) + *isfind = DidFind; + + return root; +} + +Datum +ts_rewrite_accum(PG_FUNCTION_ARGS) +{ + TSQuery acc; + ArrayType *qa; + TSQuery q; + QTNode *qex = NULL, + *subs = NULL, + *acctree = NULL; + bool isfind = false; + Datum *elemsp; + int nelemsp; + MemoryContext aggcontext; + MemoryContext oldcontext; + + aggcontext = ((AggState *) fcinfo->context)->aggcontext; + + if (PG_ARGISNULL(0) || PG_GETARG_POINTER(0) == NULL) + { + acc = (TSQuery) MemoryContextAlloc(aggcontext, HDRSIZETQ); + SET_VARSIZE(acc, HDRSIZETQ); + acc->size = 0; + } + else + acc = PG_GETARG_TSQUERY(0); + + if (PG_ARGISNULL(1) || PG_GETARG_POINTER(1) == NULL) + PG_RETURN_TSQUERY(acc); + else + qa = PG_GETARG_ARRAYTYPE_P_COPY(1); + + if (ARR_NDIM(qa) != 1) + elog(ERROR, "array must be one-dimensional, not %d dimensions", + ARR_NDIM(qa)); + if (ArrayGetNItems(ARR_NDIM(qa), ARR_DIMS(qa)) != 3) + elog(ERROR, "array should have only three elements"); + if (ARR_ELEMTYPE(qa) != TSQUERYOID) + elog(ERROR, "array should contain tsquery type"); + + deconstruct_array(qa, TSQUERYOID, -1, false, 'i', &elemsp, NULL, &nelemsp); + + q = DatumGetTSQuery(elemsp[0]); + if (q->size == 0) + { + pfree(elemsp); + PG_RETURN_POINTER(acc); + } + + if (!acc->size) + { + if (VARSIZE(acc) > HDRSIZETQ) + { + pfree(elemsp); + PG_RETURN_POINTER(acc); + } + else + acctree = QT2QTN(GETQUERY(q), GETOPERAND(q)); + } + else + acctree = QT2QTN(GETQUERY(acc), GETOPERAND(acc)); + + QTNTernary(acctree); + QTNSort(acctree); + + q = DatumGetTSQuery(elemsp[1]); + if (q->size == 0) + { + pfree(elemsp); + PG_RETURN_POINTER(acc); + } + qex = QT2QTN(GETQUERY(q), GETOPERAND(q)); + QTNTernary(qex); + QTNSort(qex); + + q = DatumGetTSQuery(elemsp[2]); + if (q->size) + subs = QT2QTN(GETQUERY(q), GETOPERAND(q)); + + acctree = findsubquery(acctree, qex, subs, &isfind); + + if (isfind || !acc->size) + { + /* pfree( acc ); do not pfree(p), because nodeAgg.c will */ + if (acctree) + { + QTNBinary(acctree); + oldcontext = MemoryContextSwitchTo(aggcontext); + acc = QTN2QT(acctree); + MemoryContextSwitchTo(oldcontext); + } + else + { + acc = (TSQuery) MemoryContextAlloc(aggcontext, HDRSIZETQ); + SET_VARSIZE(acc, HDRSIZETQ); + acc->size = 0; + } + } + + pfree(elemsp); + QTNFree(qex); + QTNFree(subs); + QTNFree(acctree); + + PG_RETURN_TSQUERY(acc); +} + +Datum +ts_rewrite_finish(PG_FUNCTION_ARGS) +{ + TSQuery acc = PG_GETARG_TSQUERY(0); + TSQuery rewrited; + + if (acc == NULL || PG_ARGISNULL(0) || acc->size == 0) + { + rewrited = (TSQuery) palloc(HDRSIZETQ); + SET_VARSIZE(rewrited, HDRSIZETQ); + rewrited->size = 0; + } + else + { + rewrited = (TSQuery) palloc(VARSIZE(acc)); + memcpy(rewrited, acc, VARSIZE(acc)); + pfree(acc); + } + + PG_RETURN_POINTER(rewrited); +} + +Datum +tsquery_rewrite(PG_FUNCTION_ARGS) +{ + TSQuery query = PG_GETARG_TSQUERY_COPY(0); + text *in = PG_GETARG_TEXT_P(1); + TSQuery rewrited = query; + MemoryContext outercontext = CurrentMemoryContext; + MemoryContext oldcontext; + QTNode *tree; + char *buf; + void *plan; + Portal portal; + bool isnull; + int i; + + if (query->size == 0) + { + PG_FREE_IF_COPY(in, 1); + PG_RETURN_POINTER(rewrited); + } + + tree = QT2QTN(GETQUERY(query), GETOPERAND(query)); + QTNTernary(tree); + QTNSort(tree); + + buf = TextPGetCString(in); + + SPI_connect(); + + if ((plan = SPI_prepare(buf, 0, NULL)) == NULL) + elog(ERROR, "SPI_prepare(\"%s\") failed", buf); + + if ((portal = SPI_cursor_open(NULL, plan, NULL, NULL, false)) == NULL) + elog(ERROR, "SPI_cursor_open(\"%s\") failed", buf); + + SPI_cursor_fetch(portal, true, 100); + + if (SPI_tuptable->tupdesc->natts != 2 || + SPI_gettypeid(SPI_tuptable->tupdesc, 1) != TSQUERYOID || + SPI_gettypeid(SPI_tuptable->tupdesc, 2) != TSQUERYOID) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("ts_rewrite query must return two tsquery columns"))); + + while (SPI_processed > 0 && tree) + { + for (i = 0; i < SPI_processed && tree; i++) + { + Datum qdata = SPI_getbinval(SPI_tuptable->vals[i], SPI_tuptable->tupdesc, 1, &isnull); + Datum sdata; + + if (isnull) + continue; + + sdata = SPI_getbinval(SPI_tuptable->vals[i], SPI_tuptable->tupdesc, 2, &isnull); + + if (!isnull) + { + TSQuery qtex = DatumGetTSQuery(qdata); + TSQuery qtsubs = DatumGetTSQuery(sdata); + QTNode *qex, + *qsubs = NULL; + + if (qtex->size == 0) + { + if (qtex != (TSQuery) DatumGetPointer(qdata)) + pfree(qtex); + if (qtsubs != (TSQuery) DatumGetPointer(sdata)) + pfree(qtsubs); + continue; + } + + qex = QT2QTN(GETQUERY(qtex), GETOPERAND(qtex)); + + QTNTernary(qex); + QTNSort(qex); + + if (qtsubs->size) + qsubs = QT2QTN(GETQUERY(qtsubs), GETOPERAND(qtsubs)); + + oldcontext = MemoryContextSwitchTo(outercontext); + tree = findsubquery(tree, qex, qsubs, NULL); + MemoryContextSwitchTo(oldcontext); + + QTNFree(qex); + if (qtex != (TSQuery) DatumGetPointer(qdata)) + pfree(qtex); + QTNFree(qsubs); + if (qtsubs != (TSQuery) DatumGetPointer(sdata)) + pfree(qtsubs); + } + } + + SPI_freetuptable(SPI_tuptable); + SPI_cursor_fetch(portal, true, 100); + } + + SPI_freetuptable(SPI_tuptable); + SPI_cursor_close(portal); + SPI_freeplan(plan); + SPI_finish(); + + if (tree) + { + QTNBinary(tree); + rewrited = QTN2QT(tree); + QTNFree(tree); + PG_FREE_IF_COPY(query, 0); + } + else + { + SET_VARSIZE(rewrited, HDRSIZETQ); + rewrited->size = 0; + } + + pfree(buf); + PG_FREE_IF_COPY(in, 1); + PG_RETURN_POINTER(rewrited); +} + +Datum +tsquery_rewrite_query(PG_FUNCTION_ARGS) +{ + TSQuery query = PG_GETARG_TSQUERY_COPY(0); + TSQuery ex = PG_GETARG_TSQUERY(1); + TSQuery subst = PG_GETARG_TSQUERY(2); + TSQuery rewrited = query; + QTNode *tree, + *qex, + *subs = NULL; + + if (query->size == 0 || ex->size == 0) + { + PG_FREE_IF_COPY(ex, 1); + PG_FREE_IF_COPY(subst, 2); + PG_RETURN_POINTER(rewrited); + } + + tree = QT2QTN(GETQUERY(query), GETOPERAND(query)); + QTNTernary(tree); + QTNSort(tree); + + qex = QT2QTN(GETQUERY(ex), GETOPERAND(ex)); + QTNTernary(qex); + QTNSort(qex); + + if (subst->size) + subs = QT2QTN(GETQUERY(subst), GETOPERAND(subst)); + + tree = findsubquery(tree, qex, subs, NULL); + QTNFree(qex); + QTNFree(subs); + + if (!tree) + { + SET_VARSIZE(rewrited, HDRSIZETQ); + rewrited->size = 0; + PG_FREE_IF_COPY(ex, 1); + PG_FREE_IF_COPY(subst, 2); + PG_RETURN_POINTER(rewrited); + } + else + { + QTNBinary(tree); + rewrited = QTN2QT(tree); + QTNFree(tree); + } + + PG_FREE_IF_COPY(query, 0); + PG_FREE_IF_COPY(ex, 1); + PG_FREE_IF_COPY(subst, 2); + PG_RETURN_POINTER(rewrited); +} diff --git a/src/backend/utils/adt/tsquery_util.c b/src/backend/utils/adt/tsquery_util.c new file mode 100644 index 00000000000..ae8cc318da9 --- /dev/null +++ b/src/backend/utils/adt/tsquery_util.c @@ -0,0 +1,317 @@ +/*------------------------------------------------------------------------- + * + * tsquery_util.c + * Utilities for tsquery datatype + * + * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * $PostgreSQL: pgsql/src/backend/utils/adt/tsquery_util.c,v 1.1 2007/08/21 01:11:19 tgl Exp $ + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "tsearch/ts_type.h" +#include "tsearch/ts_utils.h" + + +QTNode * +QT2QTN(QueryItem * in, char *operand) +{ + QTNode *node = (QTNode *) palloc0(sizeof(QTNode)); + + node->valnode = in; + + if (in->type == OPR) + { + node->child = (QTNode **) palloc0(sizeof(QTNode *) * 2); + node->child[0] = QT2QTN(in + 1, operand); + node->sign = node->child[0]->sign; + if (in->val == (int4) '!') + node->nchild = 1; + else + { + node->nchild = 2; + node->child[1] = QT2QTN(in + in->left, operand); + node->sign |= node->child[1]->sign; + } + } + else if (operand) + { + node->word = operand + in->distance; + node->sign = 1 << (in->val % 32); + } + + return node; +} + +void +QTNFree(QTNode * in) +{ + if (!in) + return; + + if (in->valnode->type == VAL && in->word && (in->flags & QTN_WORDFREE) != 0) + pfree(in->word); + + if (in->child) + { + if (in->valnode) + { + if (in->valnode->type == OPR && in->nchild > 0) + { + int i; + + for (i = 0; i < in->nchild; i++) + QTNFree(in->child[i]); + } + if (in->flags & QTN_NEEDFREE) + pfree(in->valnode); + } + pfree(in->child); + } + + pfree(in); +} + +int +QTNodeCompare(QTNode * an, QTNode * bn) +{ + if (an->valnode->type != bn->valnode->type) + return (an->valnode->type > bn->valnode->type) ? -1 : 1; + else if (an->valnode->val != bn->valnode->val) + return (an->valnode->val > bn->valnode->val) ? -1 : 1; + else if (an->valnode->type == VAL) + { + if (an->valnode->length == bn->valnode->length) + return strncmp(an->word, bn->word, an->valnode->length); + else + return (an->valnode->length > bn->valnode->length) ? -1 : 1; + } + else if (an->nchild != bn->nchild) + { + return (an->nchild > bn->nchild) ? -1 : 1; + } + else + { + int i, + res; + + for (i = 0; i < an->nchild; i++) + if ((res = QTNodeCompare(an->child[i], bn->child[i])) != 0) + return res; + } + + return 0; +} + +static int +cmpQTN(const void *a, const void *b) +{ + return QTNodeCompare(*(QTNode **) a, *(QTNode **) b); +} + +void +QTNSort(QTNode * in) +{ + int i; + + if (in->valnode->type != OPR) + return; + + for (i = 0; i < in->nchild; i++) + QTNSort(in->child[i]); + if (in->nchild > 1) + qsort((void *) in->child, in->nchild, sizeof(QTNode *), cmpQTN); +} + +bool +QTNEq(QTNode * a, QTNode * b) +{ + uint32 sign = a->sign & b->sign; + + if (!(sign == a->sign && sign == b->sign)) + return 0; + + return (QTNodeCompare(a, b) == 0) ? true : false; +} + +void +QTNTernary(QTNode * in) +{ + int i; + + if (in->valnode->type != OPR) + return; + + for (i = 0; i < in->nchild; i++) + QTNTernary(in->child[i]); + + for (i = 0; i < in->nchild; i++) + { + if (in->valnode->type == in->child[i]->valnode->type && in->valnode->val == in->child[i]->valnode->val) + { + QTNode *cc = in->child[i]; + int oldnchild = in->nchild; + + in->nchild += cc->nchild - 1; + in->child = (QTNode **) repalloc(in->child, in->nchild * sizeof(QTNode *)); + + if (i + 1 != oldnchild) + memmove(in->child + i + cc->nchild, in->child + i + 1, + (oldnchild - i - 1) * sizeof(QTNode *)); + + memcpy(in->child + i, cc->child, cc->nchild * sizeof(QTNode *)); + i += cc->nchild - 1; + + pfree(cc); + } + } +} + +void +QTNBinary(QTNode * in) +{ + int i; + + if (in->valnode->type != OPR) + return; + + for (i = 0; i < in->nchild; i++) + QTNBinary(in->child[i]); + + if (in->nchild <= 2) + return; + + while (in->nchild > 2) + { + QTNode *nn = (QTNode *) palloc0(sizeof(QTNode)); + + nn->valnode = (QueryItem *) palloc0(sizeof(QueryItem)); + nn->child = (QTNode **) palloc0(sizeof(QTNode *) * 2); + + nn->nchild = 2; + nn->flags = QTN_NEEDFREE; + + nn->child[0] = in->child[0]; + nn->child[1] = in->child[1]; + nn->sign = nn->child[0]->sign | nn->child[1]->sign; + + nn->valnode->type = in->valnode->type; + nn->valnode->val = in->valnode->val; + + in->child[0] = nn; + in->child[1] = in->child[in->nchild - 1]; + in->nchild--; + } +} + +static void +cntsize(QTNode * in, int4 *sumlen, int4 *nnode) +{ + *nnode += 1; + if (in->valnode->type == OPR) + { + int i; + + for (i = 0; i < in->nchild; i++) + cntsize(in->child[i], sumlen, nnode); + } + else + { + *sumlen += in->valnode->length + 1; + } +} + +typedef struct +{ + QueryItem *curitem; + char *operand; + char *curoperand; +} QTN2QTState; + +static void +fillQT(QTN2QTState * state, QTNode * in) +{ + *(state->curitem) = *(in->valnode); + + if (in->valnode->type == VAL) + { + memcpy(state->curoperand, in->word, in->valnode->length); + state->curitem->distance = state->curoperand - state->operand; + state->curoperand[in->valnode->length] = '\0'; + state->curoperand += in->valnode->length + 1; + state->curitem++; + } + else + { + QueryItem *curitem = state->curitem; + + Assert(in->nchild <= 2); + state->curitem++; + + fillQT(state, in->child[0]); + + if (in->nchild == 2) + { + curitem->left = state->curitem - curitem; + fillQT(state, in->child[1]); + } + } +} + +TSQuery +QTN2QT(QTNode *in) +{ + TSQuery out; + int len; + int sumlen = 0, + nnode = 0; + QTN2QTState state; + + cntsize(in, &sumlen, &nnode); + len = COMPUTESIZE(nnode, sumlen); + + out = (TSQuery) palloc(len); + SET_VARSIZE(out, len); + out->size = nnode; + + state.curitem = GETQUERY(out); + state.operand = state.curoperand = GETOPERAND(out); + + fillQT(&state, in); + return out; +} + +QTNode * +QTNCopy(QTNode *in) +{ + QTNode *out = (QTNode *) palloc(sizeof(QTNode)); + + *out = *in; + out->valnode = (QueryItem *) palloc(sizeof(QueryItem)); + *(out->valnode) = *(in->valnode); + out->flags |= QTN_NEEDFREE; + + if (in->valnode->type == VAL) + { + out->word = palloc(in->valnode->length + 1); + memcpy(out->word, in->word, in->valnode->length); + out->word[in->valnode->length] = '\0'; + out->flags |= QTN_WORDFREE; + } + else + { + int i; + + out->child = (QTNode **) palloc(sizeof(QTNode *) * in->nchild); + + for (i = 0; i < in->nchild; i++) + out->child[i] = QTNCopy(in->child[i]); + } + + return out; +} diff --git a/src/backend/utils/adt/tsrank.c b/src/backend/utils/adt/tsrank.c new file mode 100644 index 00000000000..8b2ab884c8c --- /dev/null +++ b/src/backend/utils/adt/tsrank.c @@ -0,0 +1,804 @@ +/*------------------------------------------------------------------------- + * + * tsrank.c + * rank tsvector by tsquery + * + * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * $PostgreSQL: pgsql/src/backend/utils/adt/tsrank.c,v 1.1 2007/08/21 01:11:19 tgl Exp $ + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <math.h> + +#include "tsearch/ts_type.h" +#include "tsearch/ts_utils.h" +#include "utils/array.h" + + +static float weights[] = {0.1, 0.2, 0.4, 1.0}; + +#define wpos(wep) ( w[ WEP_GETWEIGHT(wep) ] ) + +#define RANK_NO_NORM 0x00 +#define RANK_NORM_LOGLENGTH 0x01 +#define RANK_NORM_LENGTH 0x02 +#define RANK_NORM_EXTDIST 0x04 +#define RANK_NORM_UNIQ 0x08 +#define RANK_NORM_LOGUNIQ 0x10 +#define DEF_NORM_METHOD RANK_NO_NORM + +static float calc_rank_or(float *w, TSVector t, TSQuery q); +static float calc_rank_and(float *w, TSVector t, TSQuery q); + +/* + * Returns a weight of a word collocation + */ +static float4 +word_distance(int4 w) +{ + if (w > 100) + return 1e-30; + + return 1.0 / (1.005 + 0.05 * exp(((float4) w) / 1.5 - 2)); +} + +static int +cnt_length(TSVector t) +{ + WordEntry *ptr = ARRPTR(t), + *end = (WordEntry *) STRPTR(t); + int len = 0, + clen; + + while (ptr < end) + { + if ((clen = POSDATALEN(t, ptr)) == 0) + len += 1; + else + len += clen; + ptr++; + } + + return len; +} + +static int4 +WordECompareQueryItem(char *eval, char *qval, WordEntry * ptr, QueryItem * item) +{ + if (ptr->len == item->length) + return strncmp( + eval + ptr->pos, + qval + item->distance, + item->length); + + return (ptr->len > item->length) ? 1 : -1; +} + +static WordEntry * +find_wordentry(TSVector t, TSQuery q, QueryItem * item) +{ + WordEntry *StopLow = ARRPTR(t); + WordEntry *StopHigh = (WordEntry *) STRPTR(t); + WordEntry *StopMiddle; + int difference; + + /* Loop invariant: StopLow <= item < StopHigh */ + + while (StopLow < StopHigh) + { + StopMiddle = StopLow + (StopHigh - StopLow) / 2; + difference = WordECompareQueryItem(STRPTR(t), GETOPERAND(q), StopMiddle, item); + if (difference == 0) + return StopMiddle; + else if (difference < 0) + StopLow = StopMiddle + 1; + else + StopHigh = StopMiddle; + } + + return NULL; +} + + +static int +compareQueryItem(const void *a, const void *b, void *arg) +{ + char *operand = (char *) arg; + + if ((*(QueryItem **) a)->length == (*(QueryItem **) b)->length) + return strncmp(operand + (*(QueryItem **) a)->distance, + operand + (*(QueryItem **) b)->distance, + (*(QueryItem **) b)->length); + + return ((*(QueryItem **) a)->length > (*(QueryItem **) b)->length) ? 1 : -1; +} + +static QueryItem ** +SortAndUniqItems(char *operand, QueryItem * item, int *size) +{ + QueryItem **res, + **ptr, + **prevptr; + + ptr = res = (QueryItem **) palloc(sizeof(QueryItem *) * *size); + + while ((*size)--) + { + if (item->type == VAL) + { + *ptr = item; + ptr++; + } + item++; + } + + *size = ptr - res; + if (*size < 2) + return res; + + qsort_arg(res, *size, sizeof(QueryItem **), compareQueryItem, (void *) operand); + + ptr = res + 1; + prevptr = res; + + while (ptr - res < *size) + { + if (compareQueryItem((void *) ptr, (void *) prevptr, (void *) operand) != 0) + { + prevptr++; + *prevptr = *ptr; + } + ptr++; + } + + *size = prevptr + 1 - res; + return res; +} + +static WordEntryPos POSNULL[] = { + 0, + 0 +}; + +static float +calc_rank_and(float *w, TSVector t, TSQuery q) +{ + uint16 **pos; + int i, + k, + l, + p; + WordEntry *entry; + WordEntryPos *post, + *ct; + int4 dimt, + lenct, + dist; + float res = -1.0; + QueryItem **item; + int size = q->size; + + item = SortAndUniqItems(GETOPERAND(q), GETQUERY(q), &size); + if (size < 2) + { + pfree(item); + return calc_rank_or(w, t, q); + } + pos = (uint16 **) palloc(sizeof(uint16 *) * q->size); + memset(pos, 0, sizeof(uint16 *) * q->size); + *(uint16 *) POSNULL = lengthof(POSNULL) - 1; + WEP_SETPOS(POSNULL[1], MAXENTRYPOS - 1); + + for (i = 0; i < size; i++) + { + entry = find_wordentry(t, q, item[i]); + if (!entry) + continue; + + if (entry->haspos) + pos[i] = (uint16 *) _POSDATAPTR(t, entry); + else + pos[i] = (uint16 *) POSNULL; + + + dimt = *(uint16 *) (pos[i]); + post = (WordEntryPos *) (pos[i] + 1); + for (k = 0; k < i; k++) + { + if (!pos[k]) + continue; + lenct = *(uint16 *) (pos[k]); + ct = (WordEntryPos *) (pos[k] + 1); + for (l = 0; l < dimt; l++) + { + for (p = 0; p < lenct; p++) + { + dist = Abs((int) WEP_GETPOS(post[l]) - (int) WEP_GETPOS(ct[p])); + if (dist || (dist == 0 && (pos[i] == (uint16 *) POSNULL || pos[k] == (uint16 *) POSNULL))) + { + float curw; + + if (!dist) + dist = MAXENTRYPOS; + curw = sqrt(wpos(post[l]) * wpos(ct[p]) * word_distance(dist)); + res = (res < 0) ? curw : 1.0 - (1.0 - res) * (1.0 - curw); + } + } + } + } + } + pfree(pos); + pfree(item); + return res; +} + +static float +calc_rank_or(float *w, TSVector t, TSQuery q) +{ + WordEntry *entry; + WordEntryPos *post; + int4 dimt, + j, + i; + float res = 0.0; + QueryItem **item; + int size = q->size; + + *(uint16 *) POSNULL = lengthof(POSNULL) - 1; + item = SortAndUniqItems(GETOPERAND(q), GETQUERY(q), &size); + + for (i = 0; i < size; i++) + { + float resj, + wjm; + int4 jm; + + entry = find_wordentry(t, q, item[i]); + if (!entry) + continue; + + if (entry->haspos) + { + dimt = POSDATALEN(t, entry); + post = POSDATAPTR(t, entry); + } + else + { + dimt = *(uint16 *) POSNULL; + post = POSNULL + 1; + } + + resj = 0.0; + wjm = -1.0; + jm = 0; + for (j = 0; j < dimt; j++) + { + resj = resj + wpos(post[j]) / ((j + 1) * (j + 1)); + if (wpos(post[j]) > wjm) + { + wjm = wpos(post[j]); + jm = j; + } + } +/* + limit (sum(i/i^2),i->inf) = pi^2/6 + resj = sum(wi/i^2),i=1,noccurence, + wi - should be sorted desc, + don't sort for now, just choose maximum weight. This should be corrected + Oleg Bartunov +*/ + res = res + (wjm + resj - wjm / ((jm + 1) * (jm + 1))) / 1.64493406685; + } + if (size > 0) + res = res / size; + pfree(item); + return res; +} + +static float +calc_rank(float *w, TSVector t, TSQuery q, int4 method) +{ + QueryItem *item = GETQUERY(q); + float res = 0.0; + int len; + + if (!t->size || !q->size) + return 0.0; + + res = (item->type != VAL && item->val == (int4) '&') ? + calc_rank_and(w, t, q) : calc_rank_or(w, t, q); + + if (res < 0) + res = 1e-20; + + if ((method & RANK_NORM_LOGLENGTH) && t->size > 0) + res /= log((double) (cnt_length(t) + 1)) / log(2.0); + + if (method & RANK_NORM_LENGTH) + { + len = cnt_length(t); + if (len > 0) + res /= (float) len; + } + + if ((method & RANK_NORM_UNIQ) && t->size > 0) + res /= (float) (t->size); + + if ((method & RANK_NORM_LOGUNIQ) && t->size > 0) + res /= log((double) (t->size + 1)) / log(2.0); + + return res; +} + +static float * +getWeights(ArrayType *win) +{ + static float ws[lengthof(weights)]; + int i; + float4 *arrdata; + + if (win == 0) + return weights; + + if (ARR_NDIM(win) != 1) + ereport(ERROR, + (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), + errmsg("array of weight must be one-dimensional"))); + + if (ArrayGetNItems(ARR_NDIM(win), ARR_DIMS(win)) < lengthof(weights)) + ereport(ERROR, + (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), + errmsg("array of weight is too short"))); + + if (ARR_HASNULL(win)) + ereport(ERROR, + (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), + errmsg("array of weight must not contain nulls"))); + + arrdata = (float4 *) ARR_DATA_PTR(win); + for (i = 0; i < lengthof(weights); i++) + { + ws[i] = (arrdata[i] >= 0) ? arrdata[i] : weights[i]; + if (ws[i] > 1.0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("weight out of range"))); + } + + return ws; +} + +Datum +ts_rank_wttf(PG_FUNCTION_ARGS) +{ + ArrayType *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0)); + TSVector txt = PG_GETARG_TSVECTOR(1); + TSQuery query = PG_GETARG_TSQUERY(2); + int method = PG_GETARG_INT32(3); + float res; + + res = calc_rank(getWeights(win), txt, query, method); + + PG_FREE_IF_COPY(win, 0); + PG_FREE_IF_COPY(txt, 1); + PG_FREE_IF_COPY(query, 2); + PG_RETURN_FLOAT4(res); +} + +Datum +ts_rank_wtt(PG_FUNCTION_ARGS) +{ + ArrayType *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0)); + TSVector txt = PG_GETARG_TSVECTOR(1); + TSQuery query = PG_GETARG_TSQUERY(2); + float res; + + res = calc_rank(getWeights(win), txt, query, DEF_NORM_METHOD); + + PG_FREE_IF_COPY(win, 0); + PG_FREE_IF_COPY(txt, 1); + PG_FREE_IF_COPY(query, 2); + PG_RETURN_FLOAT4(res); +} + +Datum +ts_rank_ttf(PG_FUNCTION_ARGS) +{ + TSVector txt = PG_GETARG_TSVECTOR(0); + TSQuery query = PG_GETARG_TSQUERY(1); + int method = PG_GETARG_INT32(2); + float res; + + res = calc_rank(getWeights(NULL), txt, query, method); + + PG_FREE_IF_COPY(txt, 0); + PG_FREE_IF_COPY(query, 1); + PG_RETURN_FLOAT4(res); +} + +Datum +ts_rank_tt(PG_FUNCTION_ARGS) +{ + TSVector txt = PG_GETARG_TSVECTOR(0); + TSQuery query = PG_GETARG_TSQUERY(1); + float res; + + res = calc_rank(getWeights(NULL), txt, query, DEF_NORM_METHOD); + + PG_FREE_IF_COPY(txt, 0); + PG_FREE_IF_COPY(query, 1); + PG_RETURN_FLOAT4(res); +} + +typedef struct +{ + QueryItem **item; + int16 nitem; + bool needfree; + uint8 wclass; + int32 pos; +} DocRepresentation; + +static int +compareDocR(const void *a, const void *b) +{ + if (((DocRepresentation *) a)->pos == ((DocRepresentation *) b)->pos) + return 0; + return (((DocRepresentation *) a)->pos > ((DocRepresentation *) b)->pos) ? 1 : -1; +} + +static bool +checkcondition_QueryItem(void *checkval, QueryItem * val) +{ + return (bool) (val->istrue); +} + +static void +reset_istrue_flag(TSQuery query) +{ + QueryItem *item = GETQUERY(query); + int i; + + /* reset istrue flag */ + for (i = 0; i < query->size; i++) + { + if (item->type == VAL) + item->istrue = 0; + item++; + } +} + +typedef struct +{ + int pos; + int p; + int q; + DocRepresentation *begin; + DocRepresentation *end; +} Extention; + + +static bool +Cover(DocRepresentation * doc, int len, TSQuery query, Extention * ext) +{ + DocRepresentation *ptr; + int lastpos = ext->pos; + int i; + bool found = false; + + reset_istrue_flag(query); + + ext->p = 0x7fffffff; + ext->q = 0; + ptr = doc + ext->pos; + + /* find upper bound of cover from current position, move up */ + while (ptr - doc < len) + { + for (i = 0; i < ptr->nitem; i++) + ptr->item[i]->istrue = 1; + if (TS_execute(GETQUERY(query), NULL, false, checkcondition_QueryItem)) + { + if (ptr->pos > ext->q) + { + ext->q = ptr->pos; + ext->end = ptr; + lastpos = ptr - doc; + found = true; + } + break; + } + ptr++; + } + + if (!found) + return false; + + reset_istrue_flag(query); + + ptr = doc + lastpos; + + /* find lower bound of cover from founded upper bound, move down */ + while (ptr >= doc + ext->pos) + { + for (i = 0; i < ptr->nitem; i++) + ptr->item[i]->istrue = 1; + if (TS_execute(GETQUERY(query), NULL, true, checkcondition_QueryItem)) + { + if (ptr->pos < ext->p) + { + ext->begin = ptr; + ext->p = ptr->pos; + } + break; + } + ptr--; + } + + if (ext->p <= ext->q) + { + /* + * set position for next try to next lexeme after begining of founded + * cover + */ + ext->pos = (ptr - doc) + 1; + return true; + } + + ext->pos++; + return Cover(doc, len, query, ext); +} + +static DocRepresentation * +get_docrep(TSVector txt, TSQuery query, int *doclen) +{ + QueryItem *item = GETQUERY(query); + WordEntry *entry; + WordEntryPos *post; + int4 dimt, + j, + i; + int len = query->size * 4, + cur = 0; + DocRepresentation *doc; + char *operand; + + *(uint16 *) POSNULL = lengthof(POSNULL) - 1; + doc = (DocRepresentation *) palloc(sizeof(DocRepresentation) * len); + operand = GETOPERAND(query); + reset_istrue_flag(query); + + for (i = 0; i < query->size; i++) + { + if (item[i].type != VAL || item[i].istrue) + continue; + + entry = find_wordentry(txt, query, &(item[i])); + if (!entry) + continue; + + if (entry->haspos) + { + dimt = POSDATALEN(txt, entry); + post = POSDATAPTR(txt, entry); + } + else + { + dimt = *(uint16 *) POSNULL; + post = POSNULL + 1; + } + + while (cur + dimt >= len) + { + len *= 2; + doc = (DocRepresentation *) repalloc(doc, sizeof(DocRepresentation) * len); + } + + for (j = 0; j < dimt; j++) + { + if (j == 0) + { + QueryItem *kptr, + *iptr = item + i; + int k; + + doc[cur].needfree = false; + doc[cur].nitem = 0; + doc[cur].item = (QueryItem **) palloc(sizeof(QueryItem *) * query->size); + + for (k = 0; k < query->size; k++) + { + kptr = item + k; + if (k == i || + (item[k].type == VAL && + compareQueryItem(&kptr, &iptr, operand) == 0)) + { + doc[cur].item[doc[cur].nitem] = item + k; + doc[cur].nitem++; + kptr->istrue = 1; + } + } + } + else + { + doc[cur].needfree = false; + doc[cur].nitem = doc[cur - 1].nitem; + doc[cur].item = doc[cur - 1].item; + } + doc[cur].pos = WEP_GETPOS(post[j]); + doc[cur].wclass = WEP_GETWEIGHT(post[j]); + cur++; + } + } + + *doclen = cur; + + if (cur > 0) + { + if (cur > 1) + qsort((void *) doc, cur, sizeof(DocRepresentation), compareDocR); + return doc; + } + + pfree(doc); + return NULL; +} + +static float4 +calc_rank_cd(float4 *arrdata, TSVector txt, TSQuery query, int method) +{ + DocRepresentation *doc; + int len, + i, + doclen = 0; + Extention ext; + double Wdoc = 0.0; + double invws[lengthof(weights)]; + double SumDist = 0.0, + PrevExtPos = 0.0, + CurExtPos = 0.0; + int NExtent = 0; + + for (i = 0; i < lengthof(weights); i++) + { + invws[i] = ((double) ((arrdata[i] >= 0) ? arrdata[i] : weights[i])); + if (invws[i] > 1.0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("weight out of range"))); + invws[i] = 1.0 / invws[i]; + } + + doc = get_docrep(txt, query, &doclen); + if (!doc) + return 0.0; + + MemSet(&ext, 0, sizeof(Extention)); + while (Cover(doc, doclen, query, &ext)) + { + double Cpos = 0.0; + double InvSum = 0.0; + int nNoise; + DocRepresentation *ptr = ext.begin; + + while (ptr <= ext.end) + { + InvSum += invws[ptr->wclass]; + ptr++; + } + + Cpos = ((double) (ext.end - ext.begin + 1)) / InvSum; + + /* + * if doc are big enough then ext.q may be equal to ext.p due to limit + * of posional information. In this case we approximate number of + * noise word as half cover's length + */ + nNoise = (ext.q - ext.p) - (ext.end - ext.begin); + if (nNoise < 0) + nNoise = (ext.end - ext.begin) / 2; + Wdoc += Cpos / ((double) (1 + nNoise)); + + CurExtPos = ((double) (ext.q + ext.p)) / 2.0; + if (NExtent > 0 && CurExtPos > PrevExtPos /* prevent devision by + * zero in a case of + multiple lexize */ ) + SumDist += 1.0 / (CurExtPos - PrevExtPos); + + PrevExtPos = CurExtPos; + NExtent++; + } + + if ((method & RANK_NORM_LOGLENGTH) && txt->size > 0) + Wdoc /= log((double) (cnt_length(txt) + 1)); + + if (method & RANK_NORM_LENGTH) + { + len = cnt_length(txt); + if (len > 0) + Wdoc /= (double) len; + } + + if ((method & RANK_NORM_EXTDIST) && SumDist > 0) + Wdoc /= ((double) NExtent) / SumDist; + + if ((method & RANK_NORM_UNIQ) && txt->size > 0) + Wdoc /= (double) (txt->size); + + if ((method & RANK_NORM_LOGUNIQ) && txt->size > 0) + Wdoc /= log((double) (txt->size + 1)) / log(2.0); + + for (i = 0; i < doclen; i++) + if (doc[i].needfree) + pfree(doc[i].item); + pfree(doc); + + return (float4) Wdoc; +} + +Datum +ts_rankcd_wttf(PG_FUNCTION_ARGS) +{ + ArrayType *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0)); + TSVector txt = PG_GETARG_TSVECTOR(1); + TSQuery query = PG_GETARG_TSQUERY_COPY(2); + int method = PG_GETARG_INT32(3); + float res; + + res = calc_rank_cd(getWeights(win), txt, query, method); + + PG_FREE_IF_COPY(win, 0); + PG_FREE_IF_COPY(txt, 1); + PG_FREE_IF_COPY(query, 2); + PG_RETURN_FLOAT4(res); +} + +Datum +ts_rankcd_wtt(PG_FUNCTION_ARGS) +{ + ArrayType *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0)); + TSVector txt = PG_GETARG_TSVECTOR(1); + TSQuery query = PG_GETARG_TSQUERY_COPY(2); + float res; + + res = calc_rank_cd(getWeights(win), txt, query, DEF_NORM_METHOD); + + PG_FREE_IF_COPY(win, 0); + PG_FREE_IF_COPY(txt, 1); + PG_FREE_IF_COPY(query, 2); + PG_RETURN_FLOAT4(res); +} + +Datum +ts_rankcd_ttf(PG_FUNCTION_ARGS) +{ + TSVector txt = PG_GETARG_TSVECTOR(0); + TSQuery query = PG_GETARG_TSQUERY_COPY(1); + int method = PG_GETARG_INT32(2); + float res; + + res = calc_rank_cd(getWeights(NULL), txt, query, method); + + PG_FREE_IF_COPY(txt, 0); + PG_FREE_IF_COPY(query, 1); + PG_RETURN_FLOAT4(res); +} + +Datum +ts_rankcd_tt(PG_FUNCTION_ARGS) +{ + TSVector txt = PG_GETARG_TSVECTOR(0); + TSQuery query = PG_GETARG_TSQUERY_COPY(1); + float res; + + res = calc_rank_cd(getWeights(NULL), txt, query, DEF_NORM_METHOD); + + PG_FREE_IF_COPY(txt, 0); + PG_FREE_IF_COPY(query, 1); + PG_RETURN_FLOAT4(res); +} diff --git a/src/backend/utils/adt/tsvector.c b/src/backend/utils/adt/tsvector.c new file mode 100644 index 00000000000..04b6345e162 --- /dev/null +++ b/src/backend/utils/adt/tsvector.c @@ -0,0 +1,683 @@ +/*------------------------------------------------------------------------- + * + * tsvector.c + * I/O functions for tsvector + * + * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * $PostgreSQL: pgsql/src/backend/utils/adt/tsvector.c,v 1.1 2007/08/21 01:11:19 tgl Exp $ + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "libpq/pqformat.h" +#include "tsearch/ts_type.h" +#include "tsearch/ts_locale.h" +#include "tsearch/ts_utils.h" +#include "utils/memutils.h" + + +static int +comparePos(const void *a, const void *b) +{ + if (WEP_GETPOS(*(WordEntryPos *) a) == WEP_GETPOS(*(WordEntryPos *) b)) + return 0; + return (WEP_GETPOS(*(WordEntryPos *) a) > WEP_GETPOS(*(WordEntryPos *) b)) ? 1 : -1; +} + +static int +uniquePos(WordEntryPos * a, int4 l) +{ + WordEntryPos *ptr, + *res; + + if (l == 1) + return l; + + res = a; + qsort((void *) a, l, sizeof(WordEntryPos), comparePos); + + ptr = a + 1; + while (ptr - a < l) + { + if (WEP_GETPOS(*ptr) != WEP_GETPOS(*res)) + { + res++; + *res = *ptr; + if (res - a >= MAXNUMPOS - 1 || WEP_GETPOS(*res) == MAXENTRYPOS - 1) + break; + } + else if (WEP_GETWEIGHT(*ptr) > WEP_GETWEIGHT(*res)) + WEP_SETWEIGHT(*res, WEP_GETWEIGHT(*ptr)); + ptr++; + } + + return res + 1 - a; +} + +static int +compareentry(const void *a, const void *b, void *arg) +{ + char *BufferStr = (char *) arg; + + if (((WordEntryIN *) a)->entry.len == ((WordEntryIN *) b)->entry.len) + { + return strncmp(&BufferStr[((WordEntryIN *) a)->entry.pos], + &BufferStr[((WordEntryIN *) b)->entry.pos], + ((WordEntryIN *) a)->entry.len); + } + + return (((WordEntryIN *) a)->entry.len > ((WordEntryIN *) b)->entry.len) ? 1 : -1; +} + +static int +uniqueentry(WordEntryIN * a, int4 l, char *buf, int4 *outbuflen) +{ + WordEntryIN *ptr, + *res; + + res = a; + if (l == 1) + { + if (a->entry.haspos) + { + *(uint16 *) (a->pos) = uniquePos(&(a->pos[1]), *(uint16 *) (a->pos)); + *outbuflen = SHORTALIGN(res->entry.len) + (*(uint16 *) (a->pos) + 1) * sizeof(WordEntryPos); + } + return l; + } + + ptr = a + 1; + qsort_arg((void *) a, l, sizeof(WordEntryIN), compareentry, (void *) buf); + + while (ptr - a < l) + { + if (!(ptr->entry.len == res->entry.len && + strncmp(&buf[ptr->entry.pos], &buf[res->entry.pos], res->entry.len) == 0)) + { + if (res->entry.haspos) + { + *(uint16 *) (res->pos) = uniquePos(&(res->pos[1]), *(uint16 *) (res->pos)); + *outbuflen += *(uint16 *) (res->pos) * sizeof(WordEntryPos); + } + *outbuflen += SHORTALIGN(res->entry.len); + res++; + memcpy(res, ptr, sizeof(WordEntryIN)); + } + else if (ptr->entry.haspos) + { + if (res->entry.haspos) + { + int4 len = *(uint16 *) (ptr->pos) + 1 + *(uint16 *) (res->pos); + + res->pos = (WordEntryPos *) repalloc(res->pos, len * sizeof(WordEntryPos)); + memcpy(&(res->pos[*(uint16 *) (res->pos) + 1]), + &(ptr->pos[1]), *(uint16 *) (ptr->pos) * sizeof(WordEntryPos)); + *(uint16 *) (res->pos) += *(uint16 *) (ptr->pos); + pfree(ptr->pos); + } + else + { + res->entry.haspos = 1; + res->pos = ptr->pos; + } + } + ptr++; + } + if (res->entry.haspos) + { + *(uint16 *) (res->pos) = uniquePos(&(res->pos[1]), *(uint16 *) (res->pos)); + *outbuflen += *(uint16 *) (res->pos) * sizeof(WordEntryPos); + } + *outbuflen += SHORTALIGN(res->entry.len); + + return res + 1 - a; +} + +static int +WordEntryCMP(WordEntry * a, WordEntry * b, char *buf) +{ + return compareentry(a, b, buf); +} + +#define WAITWORD 1 +#define WAITENDWORD 2 +#define WAITNEXTCHAR 3 +#define WAITENDCMPLX 4 +#define WAITPOSINFO 5 +#define INPOSINFO 6 +#define WAITPOSDELIM 7 +#define WAITCHARCMPLX 8 + +#define RESIZEPRSBUF \ +do { \ + if ( state->curpos - state->word + pg_database_encoding_max_length() >= state->len ) \ + { \ + int4 clen = state->curpos - state->word; \ + state->len *= 2; \ + state->word = (char*)repalloc( (void*)state->word, state->len ); \ + state->curpos = state->word + clen; \ + } \ +} while (0) + +bool +gettoken_tsvector(TSVectorParseState *state) +{ + int4 oldstate = 0; + + state->curpos = state->word; + state->state = WAITWORD; + state->alen = 0; + + while (1) + { + if (state->state == WAITWORD) + { + if (*(state->prsbuf) == '\0') + return false; + else if (t_iseq(state->prsbuf, '\'')) + state->state = WAITENDCMPLX; + else if (t_iseq(state->prsbuf, '\\')) + { + state->state = WAITNEXTCHAR; + oldstate = WAITENDWORD; + } + else if (state->oprisdelim && ISOPERATOR(state->prsbuf)) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("syntax error in tsvector"))); + else if (!t_isspace(state->prsbuf)) + { + COPYCHAR(state->curpos, state->prsbuf); + state->curpos += pg_mblen(state->prsbuf); + state->state = WAITENDWORD; + } + } + else if (state->state == WAITNEXTCHAR) + { + if (*(state->prsbuf) == '\0') + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("there is no escaped character"))); + else + { + RESIZEPRSBUF; + COPYCHAR(state->curpos, state->prsbuf); + state->curpos += pg_mblen(state->prsbuf); + state->state = oldstate; + } + } + else if (state->state == WAITENDWORD) + { + if (t_iseq(state->prsbuf, '\\')) + { + state->state = WAITNEXTCHAR; + oldstate = WAITENDWORD; + } + else if (t_isspace(state->prsbuf) || *(state->prsbuf) == '\0' || + (state->oprisdelim && ISOPERATOR(state->prsbuf))) + { + RESIZEPRSBUF; + if (state->curpos == state->word) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("syntax error in tsvector"))); + *(state->curpos) = '\0'; + return true; + } + else if (t_iseq(state->prsbuf, ':')) + { + if (state->curpos == state->word) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("syntax error in tsvector"))); + *(state->curpos) = '\0'; + if (state->oprisdelim) + return true; + else + state->state = INPOSINFO; + } + else + { + RESIZEPRSBUF; + COPYCHAR(state->curpos, state->prsbuf); + state->curpos += pg_mblen(state->prsbuf); + } + } + else if (state->state == WAITENDCMPLX) + { + if (t_iseq(state->prsbuf, '\'')) + { + state->state = WAITCHARCMPLX; + } + else if (t_iseq(state->prsbuf, '\\')) + { + state->state = WAITNEXTCHAR; + oldstate = WAITENDCMPLX; + } + else if (*(state->prsbuf) == '\0') + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("syntax error in tsvector"))); + else + { + RESIZEPRSBUF; + COPYCHAR(state->curpos, state->prsbuf); + state->curpos += pg_mblen(state->prsbuf); + } + } + else if (state->state == WAITCHARCMPLX) + { + if (t_iseq(state->prsbuf, '\'')) + { + RESIZEPRSBUF; + COPYCHAR(state->curpos, state->prsbuf); + state->curpos += pg_mblen(state->prsbuf); + state->state = WAITENDCMPLX; + } + else + { + RESIZEPRSBUF; + *(state->curpos) = '\0'; + if (state->curpos == state->word) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("syntax error in tsvector"))); + if (state->oprisdelim) + { + /* state->prsbuf+=pg_mblen(state->prsbuf); */ + return true; + } + else + state->state = WAITPOSINFO; + continue; /* recheck current character */ + } + } + else if (state->state == WAITPOSINFO) + { + if (t_iseq(state->prsbuf, ':')) + state->state = INPOSINFO; + else + return true; + } + else if (state->state == INPOSINFO) + { + if (t_isdigit(state->prsbuf)) + { + if (state->alen == 0) + { + state->alen = 4; + state->pos = (WordEntryPos *) palloc(sizeof(WordEntryPos) * state->alen); + *(uint16 *) (state->pos) = 0; + } + else if (*(uint16 *) (state->pos) + 1 >= state->alen) + { + state->alen *= 2; + state->pos = (WordEntryPos *) repalloc(state->pos, sizeof(WordEntryPos) * state->alen); + } + (*(uint16 *) (state->pos))++; + WEP_SETPOS(state->pos[*(uint16 *) (state->pos)], LIMITPOS(atoi(state->prsbuf))); + if (WEP_GETPOS(state->pos[*(uint16 *) (state->pos)]) == 0) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("wrong position info in tsvector"))); + WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 0); + state->state = WAITPOSDELIM; + } + else + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("syntax error in tsvector"))); + } + else if (state->state == WAITPOSDELIM) + { + if (t_iseq(state->prsbuf, ',')) + state->state = INPOSINFO; + else if (t_iseq(state->prsbuf, 'a') || t_iseq(state->prsbuf, 'A') || t_iseq(state->prsbuf, '*')) + { + if (WEP_GETWEIGHT(state->pos[*(uint16 *) (state->pos)])) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("syntax error in tsvector"))); + WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 3); + } + else if (t_iseq(state->prsbuf, 'b') || t_iseq(state->prsbuf, 'B')) + { + if (WEP_GETWEIGHT(state->pos[*(uint16 *) (state->pos)])) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("syntax error in tsvector"))); + WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 2); + } + else if (t_iseq(state->prsbuf, 'c') || t_iseq(state->prsbuf, 'C')) + { + if (WEP_GETWEIGHT(state->pos[*(uint16 *) (state->pos)])) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("syntax error in tsvector"))); + WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 1); + } + else if (t_iseq(state->prsbuf, 'd') || t_iseq(state->prsbuf, 'D')) + { + if (WEP_GETWEIGHT(state->pos[*(uint16 *) (state->pos)])) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("syntax error in tsvector"))); + WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 0); + } + else if (t_isspace(state->prsbuf) || + *(state->prsbuf) == '\0') + return true; + else if (!t_isdigit(state->prsbuf)) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("syntax error in tsvector"))); + } + else /* internal error */ + elog(ERROR, "internal error in gettoken_tsvector"); + + /* get next char */ + state->prsbuf += pg_mblen(state->prsbuf); + } + + return false; +} + +Datum +tsvectorin(PG_FUNCTION_ARGS) +{ + char *buf = PG_GETARG_CSTRING(0); + TSVectorParseState state; + WordEntryIN *arr; + WordEntry *inarr; + int4 len = 0, + totallen = 64; + TSVector in; + char *tmpbuf, + *cur; + int4 i, + buflen = 256; + + pg_verifymbstr(buf, strlen(buf), false); + state.prsbuf = buf; + state.len = 32; + state.word = (char *) palloc(state.len); + state.oprisdelim = false; + + arr = (WordEntryIN *) palloc(sizeof(WordEntryIN) * totallen); + cur = tmpbuf = (char *) palloc(buflen); + + while (gettoken_tsvector(&state)) + { + /* + * Realloc buffers if it's needed + */ + if (len >= totallen) + { + totallen *= 2; + arr = (WordEntryIN *) repalloc((void *) arr, sizeof(WordEntryIN) * totallen); + } + + while ((cur - tmpbuf) + (state.curpos - state.word) >= buflen) + { + int4 dist = cur - tmpbuf; + + buflen *= 2; + tmpbuf = (char *) repalloc((void *) tmpbuf, buflen); + cur = tmpbuf + dist; + } + + if (state.curpos - state.word >= MAXSTRLEN) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("word is too long (%d bytes, max %d bytes)", + state.curpos - state.word, MAXSTRLEN))); + + arr[len].entry.len = state.curpos - state.word; + if (cur - tmpbuf > MAXSTRPOS) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("position value too large"))); + arr[len].entry.pos = cur - tmpbuf; + memcpy((void *) cur, (void *) state.word, arr[len].entry.len); + cur += arr[len].entry.len; + + if (state.alen) + { + arr[len].entry.haspos = 1; + arr[len].pos = state.pos; + } + else + arr[len].entry.haspos = 0; + len++; + } + pfree(state.word); + + if (len > 0) + len = uniqueentry(arr, len, tmpbuf, &buflen); + else + buflen = 0; + totallen = CALCDATASIZE(len, buflen); + in = (TSVector) palloc0(totallen); + + SET_VARSIZE(in, totallen); + in->size = len; + cur = STRPTR(in); + inarr = ARRPTR(in); + for (i = 0; i < len; i++) + { + memcpy((void *) cur, (void *) &tmpbuf[arr[i].entry.pos], arr[i].entry.len); + arr[i].entry.pos = cur - STRPTR(in); + cur += SHORTALIGN(arr[i].entry.len); + if (arr[i].entry.haspos) + { + memcpy(cur, arr[i].pos, (*(uint16 *) arr[i].pos + 1) * sizeof(WordEntryPos)); + cur += (*(uint16 *) arr[i].pos + 1) * sizeof(WordEntryPos); + pfree(arr[i].pos); + } + inarr[i] = arr[i].entry; + } + + PG_RETURN_TSVECTOR(in); +} + +Datum +tsvectorout(PG_FUNCTION_ARGS) +{ + TSVector out = PG_GETARG_TSVECTOR(0); + char *outbuf; + int4 i, + lenbuf = 0, + pp; + WordEntry *ptr = ARRPTR(out); + char *curbegin, + *curin, + *curout; + + lenbuf = out->size * 2 /* '' */ + out->size - 1 /* space */ + 2 /* \0 */ ; + for (i = 0; i < out->size; i++) + { + lenbuf += ptr[i].len * 2 * pg_database_encoding_max_length() /* for escape */ ; + if (ptr[i].haspos) + lenbuf += 1 /* : */ + 7 /* int2 + , + weight */ * POSDATALEN(out, &(ptr[i])); + } + + curout = outbuf = (char *) palloc(lenbuf); + for (i = 0; i < out->size; i++) + { + curbegin = curin = STRPTR(out) + ptr->pos; + if (i != 0) + *curout++ = ' '; + *curout++ = '\''; + while (curin - curbegin < ptr->len) + { + int len = pg_mblen(curin); + + if (t_iseq(curin, '\'')) + *curout++ = '\''; + + while (len--) + *curout++ = *curin++; + } + + *curout++ = '\''; + if ((pp = POSDATALEN(out, ptr)) != 0) + { + WordEntryPos *wptr; + + *curout++ = ':'; + wptr = POSDATAPTR(out, ptr); + while (pp) + { + curout += sprintf(curout, "%d", WEP_GETPOS(*wptr)); + switch (WEP_GETWEIGHT(*wptr)) + { + case 3: + *curout++ = 'A'; + break; + case 2: + *curout++ = 'B'; + break; + case 1: + *curout++ = 'C'; + break; + case 0: + default: + break; + } + + if (pp > 1) + *curout++ = ','; + pp--; + wptr++; + } + } + ptr++; + } + + *curout = '\0'; + PG_FREE_IF_COPY(out, 0); + PG_RETURN_CSTRING(outbuf); +} + +Datum +tsvectorsend(PG_FUNCTION_ARGS) +{ + TSVector vec = PG_GETARG_TSVECTOR(0); + StringInfoData buf; + int i, + j; + WordEntry *weptr = ARRPTR(vec); + + pq_begintypsend(&buf); + + pq_sendint(&buf, vec->size, sizeof(int32)); + for (i = 0; i < vec->size; i++) + { + /* + * We are sure that sizeof(WordEntry) == sizeof(int32) + */ + pq_sendint(&buf, *(int32 *) weptr, sizeof(int32)); + + pq_sendbytes(&buf, STRPTR(vec) + weptr->pos, weptr->len); + if (weptr->haspos) + { + WordEntryPos *wepptr = POSDATAPTR(vec, weptr); + + pq_sendint(&buf, POSDATALEN(vec, weptr), sizeof(WordEntryPos)); + for (j = 0; j < POSDATALEN(vec, weptr); j++) + pq_sendint(&buf, wepptr[j], sizeof(WordEntryPos)); + } + weptr++; + } + + PG_RETURN_BYTEA_P(pq_endtypsend(&buf)); +} + +Datum +tsvectorrecv(PG_FUNCTION_ARGS) +{ + StringInfo buf = (StringInfo) PG_GETARG_POINTER(0); + TSVector vec; + int i, + size, + len = DATAHDRSIZE; + WordEntry *weptr; + int datalen = 0; + + size = pq_getmsgint(buf, sizeof(uint32)); + if (size < 0 || size > (MaxAllocSize / sizeof(WordEntry))) + elog(ERROR, "invalid size of tsvector"); + + len += sizeof(WordEntry) * size; + + len *= 2; + vec = (TSVector) palloc0(len); + vec->size = size; + + weptr = ARRPTR(vec); + for (i = 0; i < size; i++) + { + int tmp; + + weptr = ARRPTR(vec) + i; + + /* + * We are sure that sizeof(WordEntry) == sizeof(int32) + */ + tmp = pq_getmsgint(buf, sizeof(int32)); + *weptr = *(WordEntry *) & tmp; + + while (CALCDATASIZE(size, datalen + SHORTALIGN(weptr->len)) >= len) + { + len *= 2; + vec = (TSVector) repalloc(vec, len); + weptr = ARRPTR(vec) + i; + } + + memcpy(STRPTR(vec) + weptr->pos, + pq_getmsgbytes(buf, weptr->len), + weptr->len); + datalen += SHORTALIGN(weptr->len); + + if (i > 0 && WordEntryCMP(weptr, weptr - 1, STRPTR(vec)) <= 0) + elog(ERROR, "lexemes are unordered"); + + if (weptr->haspos) + { + uint16 j, + npos; + WordEntryPos *wepptr; + + npos = (uint16) pq_getmsgint(buf, sizeof(int16)); + if (npos > MAXNUMPOS) + elog(ERROR, "unexpected number of positions"); + + while (CALCDATASIZE(size, datalen + (npos + 1) * sizeof(WordEntryPos)) >= len) + { + len *= 2; + vec = (TSVector) repalloc(vec, len); + weptr = ARRPTR(vec) + i; + } + + memcpy(_POSDATAPTR(vec, weptr), &npos, sizeof(int16)); + wepptr = POSDATAPTR(vec, weptr); + for (j = 0; j < npos; j++) + { + wepptr[j] = (WordEntryPos) pq_getmsgint(buf, sizeof(int16)); + if (j > 0 && WEP_GETPOS(wepptr[j]) <= WEP_GETPOS(wepptr[j - 1])) + elog(ERROR, "position information is unordered"); + } + + datalen += (npos + 1) * sizeof(WordEntry); + } + } + + SET_VARSIZE(vec, CALCDATASIZE(vec->size, datalen)); + + PG_RETURN_TSVECTOR(vec); +} diff --git a/src/backend/utils/adt/tsvector_op.c b/src/backend/utils/adt/tsvector_op.c new file mode 100644 index 00000000000..341247bec75 --- /dev/null +++ b/src/backend/utils/adt/tsvector_op.c @@ -0,0 +1,1334 @@ +/*------------------------------------------------------------------------- + * + * tsvector_op.c + * operations over tsvector + * + * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * $PostgreSQL: pgsql/src/backend/utils/adt/tsvector_op.c,v 1.1 2007/08/21 01:11:19 tgl Exp $ + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "catalog/namespace.h" +#include "commands/trigger.h" +#include "executor/spi.h" +#include "funcapi.h" +#include "mb/pg_wchar.h" +#include "tsearch/ts_type.h" +#include "tsearch/ts_utils.h" +#include "utils/builtins.h" +#include "utils/lsyscache.h" + + +typedef struct +{ + WordEntry *arrb; + WordEntry *arre; + char *values; + char *operand; +} CHKVAL; + +typedef struct +{ + uint32 cur; + TSVector stat; +} StatStorage; + +typedef struct +{ + uint32 len; + uint32 pos; + uint32 ndoc; + uint32 nentry; +} StatEntry; + +typedef struct +{ + int32 vl_len_; /* varlena header (do not touch directly!) */ + int4 size; + int4 weight; + char data[1]; +} tsstat; + +#define STATHDRSIZE (sizeof(int4) * 4) +#define CALCSTATSIZE(x, lenstr) ( (x) * sizeof(StatEntry) + STATHDRSIZE + (lenstr) ) +#define STATPTR(x) ( (StatEntry*) ( (char*)(x) + STATHDRSIZE ) ) +#define STATSTRPTR(x) ( (char*)(x) + STATHDRSIZE + ( sizeof(StatEntry) * ((TSVector)(x))->size ) ) +#define STATSTRSIZE(x) ( VARSIZE((TSVector)(x)) - STATHDRSIZE - ( sizeof(StatEntry) * ((TSVector)(x))->size ) ) + + +static Datum tsvector_update_trigger(PG_FUNCTION_ARGS, bool config_column); + + +static int +silly_cmp_tsvector(const TSVector a, const TSVector b) +{ + if (VARSIZE(a) < VARSIZE(b)) + return -1; + else if (VARSIZE(a) > VARSIZE(b)) + return 1; + else if (a->size < b->size) + return -1; + else if (a->size > b->size) + return 1; + else + { + WordEntry *aptr = ARRPTR(a); + WordEntry *bptr = ARRPTR(b); + int i = 0; + int res; + + + for (i = 0; i < a->size; i++) + { + if (aptr->haspos != bptr->haspos) + { + return (aptr->haspos > bptr->haspos) ? -1 : 1; + } + else if (aptr->len != bptr->len) + { + return (aptr->len > bptr->len) ? -1 : 1; + } + else if ((res = strncmp(STRPTR(a) + aptr->pos, STRPTR(b) + bptr->pos, bptr->len)) != 0) + { + return res; + } + else if (aptr->haspos) + { + WordEntryPos *ap = POSDATAPTR(a, aptr); + WordEntryPos *bp = POSDATAPTR(b, bptr); + int j; + + if (POSDATALEN(a, aptr) != POSDATALEN(b, bptr)) + return (POSDATALEN(a, aptr) > POSDATALEN(b, bptr)) ? -1 : 1; + + for (j = 0; j < POSDATALEN(a, aptr); j++) + { + if (WEP_GETPOS(*ap) != WEP_GETPOS(*bp)) + { + return (WEP_GETPOS(*ap) > WEP_GETPOS(*bp)) ? -1 : 1; + } + else if (WEP_GETWEIGHT(*ap) != WEP_GETWEIGHT(*bp)) + { + return (WEP_GETWEIGHT(*ap) > WEP_GETWEIGHT(*bp)) ? -1 : 1; + } + ap++, bp++; + } + } + + aptr++; + bptr++; + } + } + + return 0; +} + +#define TSVECTORCMPFUNC( type, action, ret ) \ +Datum \ +tsvector_##type(PG_FUNCTION_ARGS) \ +{ \ + TSVector a = PG_GETARG_TSVECTOR(0); \ + TSVector b = PG_GETARG_TSVECTOR(1); \ + int res = silly_cmp_tsvector(a, b); \ + PG_FREE_IF_COPY(a,0); \ + PG_FREE_IF_COPY(b,1); \ + PG_RETURN_##ret( res action 0 ); \ +} + +TSVECTORCMPFUNC(lt, <, BOOL); +TSVECTORCMPFUNC(le, <=, BOOL); +TSVECTORCMPFUNC(eq, ==, BOOL); +TSVECTORCMPFUNC(ge, >=, BOOL); +TSVECTORCMPFUNC(gt, >, BOOL); +TSVECTORCMPFUNC(ne, !=, BOOL); +TSVECTORCMPFUNC(cmp, +, INT32); + +Datum +tsvector_strip(PG_FUNCTION_ARGS) +{ + TSVector in = PG_GETARG_TSVECTOR(0); + TSVector out; + int i, + len = 0; + WordEntry *arrin = ARRPTR(in), + *arrout; + char *cur; + + for (i = 0; i < in->size; i++) + len += SHORTALIGN(arrin[i].len); + + len = CALCDATASIZE(in->size, len); + out = (TSVector) palloc0(len); + SET_VARSIZE(out, len); + out->size = in->size; + arrout = ARRPTR(out); + cur = STRPTR(out); + for (i = 0; i < in->size; i++) + { + memcpy(cur, STRPTR(in) + arrin[i].pos, arrin[i].len); + arrout[i].haspos = 0; + arrout[i].len = arrin[i].len; + arrout[i].pos = cur - STRPTR(out); + cur += SHORTALIGN(arrout[i].len); + } + + PG_FREE_IF_COPY(in, 0); + PG_RETURN_POINTER(out); +} + +Datum +tsvector_length(PG_FUNCTION_ARGS) +{ + TSVector in = PG_GETARG_TSVECTOR(0); + int4 ret = in->size; + + PG_FREE_IF_COPY(in, 0); + PG_RETURN_INT32(ret); +} + +Datum +tsvector_setweight(PG_FUNCTION_ARGS) +{ + TSVector in = PG_GETARG_TSVECTOR(0); + char cw = PG_GETARG_CHAR(1); + TSVector out; + int i, + j; + WordEntry *entry; + WordEntryPos *p; + int w = 0; + + switch (cw) + { + case 'A': + case 'a': + w = 3; + break; + case 'B': + case 'b': + w = 2; + break; + case 'C': + case 'c': + w = 1; + break; + case 'D': + case 'd': + w = 0; + break; + /* internal error */ + default: + elog(ERROR, "unrecognized weight"); + } + + out = (TSVector) palloc(VARSIZE(in)); + memcpy(out, in, VARSIZE(in)); + entry = ARRPTR(out); + i = out->size; + while (i--) + { + if ((j = POSDATALEN(out, entry)) != 0) + { + p = POSDATAPTR(out, entry); + while (j--) + { + WEP_SETWEIGHT(*p, w); + p++; + } + } + entry++; + } + + PG_FREE_IF_COPY(in, 0); + PG_RETURN_POINTER(out); +} + +static int +compareEntry(char *ptra, WordEntry * a, char *ptrb, WordEntry * b) +{ + if (a->len == b->len) + { + return strncmp( + ptra + a->pos, + ptrb + b->pos, + a->len); + } + return (a->len > b->len) ? 1 : -1; +} + +static int4 +add_pos(TSVector src, WordEntry * srcptr, TSVector dest, WordEntry * destptr, int4 maxpos) +{ + uint16 *clen = (uint16 *) _POSDATAPTR(dest, destptr); + int i; + uint16 slen = POSDATALEN(src, srcptr), + startlen; + WordEntryPos *spos = POSDATAPTR(src, srcptr), + *dpos = POSDATAPTR(dest, destptr); + + if (!destptr->haspos) + *clen = 0; + + startlen = *clen; + for (i = 0; i < slen && *clen < MAXNUMPOS && (*clen == 0 || WEP_GETPOS(dpos[*clen - 1]) != MAXENTRYPOS - 1); i++) + { + WEP_SETWEIGHT(dpos[*clen], WEP_GETWEIGHT(spos[i])); + WEP_SETPOS(dpos[*clen], LIMITPOS(WEP_GETPOS(spos[i]) + maxpos)); + (*clen)++; + } + + if (*clen != startlen) + destptr->haspos = 1; + return *clen - startlen; +} + + +Datum +tsvector_concat(PG_FUNCTION_ARGS) +{ + TSVector in1 = PG_GETARG_TSVECTOR(0); + TSVector in2 = PG_GETARG_TSVECTOR(1); + TSVector out; + WordEntry *ptr; + WordEntry *ptr1, + *ptr2; + WordEntryPos *p; + int maxpos = 0, + i, + j, + i1, + i2; + char *cur; + char *data, + *data1, + *data2; + + ptr = ARRPTR(in1); + i = in1->size; + while (i--) + { + if ((j = POSDATALEN(in1, ptr)) != 0) + { + p = POSDATAPTR(in1, ptr); + while (j--) + { + if (WEP_GETPOS(*p) > maxpos) + maxpos = WEP_GETPOS(*p); + p++; + } + } + ptr++; + } + + ptr1 = ARRPTR(in1); + ptr2 = ARRPTR(in2); + data1 = STRPTR(in1); + data2 = STRPTR(in2); + i1 = in1->size; + i2 = in2->size; + out = (TSVector) palloc0(VARSIZE(in1) + VARSIZE(in2)); + SET_VARSIZE(out, VARSIZE(in1) + VARSIZE(in2)); + out->size = in1->size + in2->size; + data = cur = STRPTR(out); + ptr = ARRPTR(out); + while (i1 && i2) + { + int cmp = compareEntry(data1, ptr1, data2, ptr2); + + if (cmp < 0) + { /* in1 first */ + ptr->haspos = ptr1->haspos; + ptr->len = ptr1->len; + memcpy(cur, data1 + ptr1->pos, ptr1->len); + ptr->pos = cur - data; + cur += SHORTALIGN(ptr1->len); + if (ptr->haspos) + { + memcpy(cur, _POSDATAPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16)); + cur += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16); + } + ptr++; + ptr1++; + i1--; + } + else if (cmp > 0) + { /* in2 first */ + ptr->haspos = ptr2->haspos; + ptr->len = ptr2->len; + memcpy(cur, data2 + ptr2->pos, ptr2->len); + ptr->pos = cur - data; + cur += SHORTALIGN(ptr2->len); + if (ptr->haspos) + { + int addlen = add_pos(in2, ptr2, out, ptr, maxpos); + + if (addlen == 0) + ptr->haspos = 0; + else + cur += addlen * sizeof(WordEntryPos) + sizeof(uint16); + } + ptr++; + ptr2++; + i2--; + } + else + { + ptr->haspos = ptr1->haspos | ptr2->haspos; + ptr->len = ptr1->len; + memcpy(cur, data1 + ptr1->pos, ptr1->len); + ptr->pos = cur - data; + cur += SHORTALIGN(ptr1->len); + if (ptr->haspos) + { + if (ptr1->haspos) + { + memcpy(cur, _POSDATAPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16)); + cur += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16); + if (ptr2->haspos) + cur += add_pos(in2, ptr2, out, ptr, maxpos) * sizeof(WordEntryPos); + } + else if (ptr2->haspos) + { + int addlen = add_pos(in2, ptr2, out, ptr, maxpos); + + if (addlen == 0) + ptr->haspos = 0; + else + cur += addlen * sizeof(WordEntryPos) + sizeof(uint16); + } + } + ptr++; + ptr1++; + ptr2++; + i1--; + i2--; + } + } + + while (i1) + { + ptr->haspos = ptr1->haspos; + ptr->len = ptr1->len; + memcpy(cur, data1 + ptr1->pos, ptr1->len); + ptr->pos = cur - data; + cur += SHORTALIGN(ptr1->len); + if (ptr->haspos) + { + memcpy(cur, _POSDATAPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16)); + cur += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16); + } + ptr++; + ptr1++; + i1--; + } + + while (i2) + { + ptr->haspos = ptr2->haspos; + ptr->len = ptr2->len; + memcpy(cur, data2 + ptr2->pos, ptr2->len); + ptr->pos = cur - data; + cur += SHORTALIGN(ptr2->len); + if (ptr->haspos) + { + int addlen = add_pos(in2, ptr2, out, ptr, maxpos); + + if (addlen == 0) + ptr->haspos = 0; + else + cur += addlen * sizeof(WordEntryPos) + sizeof(uint16); + } + ptr++; + ptr2++; + i2--; + } + + out->size = ptr - ARRPTR(out); + SET_VARSIZE(out, CALCDATASIZE(out->size, cur - data)); + if (data != STRPTR(out)) + memmove(STRPTR(out), data, cur - data); + + PG_FREE_IF_COPY(in1, 0); + PG_FREE_IF_COPY(in2, 1); + PG_RETURN_POINTER(out); +} + +/* + * compare 2 string values + */ +static int4 +ValCompare(CHKVAL * chkval, WordEntry * ptr, QueryItem * item) +{ + if (ptr->len == item->length) + return strncmp( + &(chkval->values[ptr->pos]), + &(chkval->operand[item->distance]), + item->length); + + return (ptr->len > item->length) ? 1 : -1; +} + +/* + * check weight info + */ +static bool +checkclass_str(CHKVAL * chkval, WordEntry * val, QueryItem * item) +{ + WordEntryPos *ptr = (WordEntryPos *) (chkval->values + val->pos + SHORTALIGN(val->len) + sizeof(uint16)); + uint16 len = *((uint16 *) (chkval->values + val->pos + SHORTALIGN(val->len))); + + while (len--) + { + if (item->weight & (1 << WEP_GETWEIGHT(*ptr))) + return true; + ptr++; + } + return false; +} + +/* + * is there value 'val' in array or not ? + */ +static bool +checkcondition_str(void *checkval, QueryItem * val) +{ + WordEntry *StopLow = ((CHKVAL *) checkval)->arrb; + WordEntry *StopHigh = ((CHKVAL *) checkval)->arre; + WordEntry *StopMiddle; + int difference; + + /* Loop invariant: StopLow <= val < StopHigh */ + + while (StopLow < StopHigh) + { + StopMiddle = StopLow + (StopHigh - StopLow) / 2; + difference = ValCompare((CHKVAL *) checkval, StopMiddle, val); + if (difference == 0) + return (val->weight && StopMiddle->haspos) ? + checkclass_str((CHKVAL *) checkval, StopMiddle, val) : true; + else if (difference < 0) + StopLow = StopMiddle + 1; + else + StopHigh = StopMiddle; + } + + return (false); +} + +/* + * check for boolean condition + */ +bool +TS_execute(QueryItem * curitem, void *checkval, bool calcnot, bool (*chkcond) (void *checkval, QueryItem * val)) +{ + if (curitem->type == VAL) + return chkcond(checkval, curitem); + else if (curitem->val == (int4) '!') + { + return (calcnot) ? + ((TS_execute(curitem + 1, checkval, calcnot, chkcond)) ? false : true) + : true; + } + else if (curitem->val == (int4) '&') + { + if (TS_execute(curitem + curitem->left, checkval, calcnot, chkcond)) + return TS_execute(curitem + 1, checkval, calcnot, chkcond); + else + return false; + } + else + { /* |-operator */ + if (TS_execute(curitem + curitem->left, checkval, calcnot, chkcond)) + return true; + else + return TS_execute(curitem + 1, checkval, calcnot, chkcond); + } + return false; +} + +/* + * boolean operations + */ +Datum +ts_match_qv(PG_FUNCTION_ARGS) +{ + PG_RETURN_DATUM(DirectFunctionCall2(ts_match_vq, + PG_GETARG_DATUM(1), + PG_GETARG_DATUM(0))); +} + +Datum +ts_match_vq(PG_FUNCTION_ARGS) +{ + TSVector val = PG_GETARG_TSVECTOR(0); + TSQuery query = PG_GETARG_TSQUERY(1); + CHKVAL chkval; + bool result; + + if (!val->size || !query->size) + { + PG_FREE_IF_COPY(val, 0); + PG_FREE_IF_COPY(query, 1); + PG_RETURN_BOOL(false); + } + + chkval.arrb = ARRPTR(val); + chkval.arre = chkval.arrb + val->size; + chkval.values = STRPTR(val); + chkval.operand = GETOPERAND(query); + result = TS_execute( + GETQUERY(query), + &chkval, + true, + checkcondition_str + ); + + PG_FREE_IF_COPY(val, 0); + PG_FREE_IF_COPY(query, 1); + PG_RETURN_BOOL(result); +} + +Datum +ts_match_tt(PG_FUNCTION_ARGS) +{ + TSVector vector; + TSQuery query; + bool res; + + vector = DatumGetTSVector(DirectFunctionCall1(to_tsvector, + PG_GETARG_DATUM(0))); + query = DatumGetTSQuery(DirectFunctionCall1(plainto_tsquery, + PG_GETARG_DATUM(1))); + + res = DatumGetBool(DirectFunctionCall2(ts_match_vq, + TSVectorGetDatum(vector), + TSQueryGetDatum(query))); + + pfree(vector); + pfree(query); + + PG_RETURN_BOOL(res); +} + +Datum +ts_match_tq(PG_FUNCTION_ARGS) +{ + TSVector vector; + TSQuery query = PG_GETARG_TSQUERY(1); + bool res; + + vector = DatumGetTSVector(DirectFunctionCall1(to_tsvector, + PG_GETARG_DATUM(0))); + + res = DatumGetBool(DirectFunctionCall2(ts_match_vq, + TSVectorGetDatum(vector), + TSQueryGetDatum(query))); + + pfree(vector); + PG_FREE_IF_COPY(query, 1); + + PG_RETURN_BOOL(res); +} + +/* + * Statistics of tsvector + */ +static int +check_weight(TSVector txt, WordEntry * wptr, int8 weight) +{ + int len = POSDATALEN(txt, wptr); + int num = 0; + WordEntryPos *ptr = POSDATAPTR(txt, wptr); + + while (len--) + { + if (weight & (1 << WEP_GETWEIGHT(*ptr))) + num++; + ptr++; + } + return num; +} + +static WordEntry ** +SEI_realloc(WordEntry ** in, uint32 *len) +{ + if (*len == 0 || in == NULL) + { + *len = 8; + in = palloc(sizeof(WordEntry *) * (*len)); + } + else + { + *len *= 2; + in = repalloc(in, sizeof(WordEntry *) * (*len)); + } + return in; +} + +static int +compareStatWord(StatEntry * a, WordEntry * b, tsstat * stat, TSVector txt) +{ + if (a->len == b->len) + return strncmp( + STATSTRPTR(stat) + a->pos, + STRPTR(txt) + b->pos, + a->len + ); + return (a->len > b->len) ? 1 : -1; +} + +static tsstat * +formstat(tsstat * stat, TSVector txt, WordEntry ** entry, uint32 len) +{ + tsstat *newstat; + uint32 totallen, + nentry; + uint32 slen = 0; + WordEntry **ptr = entry; + char *curptr; + StatEntry *sptr, + *nptr; + + while (ptr - entry < len) + { + slen += (*ptr)->len; + ptr++; + } + + nentry = stat->size + len; + slen += STATSTRSIZE(stat); + totallen = CALCSTATSIZE(nentry, slen); + newstat = palloc(totallen); + SET_VARSIZE(newstat, totallen); + newstat->weight = stat->weight; + newstat->size = nentry; + + memcpy(STATSTRPTR(newstat), STATSTRPTR(stat), STATSTRSIZE(stat)); + curptr = STATSTRPTR(newstat) + STATSTRSIZE(stat); + + ptr = entry; + sptr = STATPTR(stat); + nptr = STATPTR(newstat); + + if (len == 1) + { + StatEntry *StopLow = STATPTR(stat); + StatEntry *StopHigh = (StatEntry *) STATSTRPTR(stat); + + while (StopLow < StopHigh) + { + sptr = StopLow + (StopHigh - StopLow) / 2; + if (compareStatWord(sptr, *ptr, stat, txt) < 0) + StopLow = sptr + 1; + else + StopHigh = sptr; + } + nptr = STATPTR(newstat) + (StopLow - STATPTR(stat)); + memcpy(STATPTR(newstat), STATPTR(stat), sizeof(StatEntry) * (StopLow - STATPTR(stat))); + if ((*ptr)->haspos) + nptr->nentry = (stat->weight) ? check_weight(txt, *ptr, stat->weight) : POSDATALEN(txt, *ptr); + else + nptr->nentry = 1; + nptr->ndoc = 1; + nptr->len = (*ptr)->len; + memcpy(curptr, STRPTR(txt) + (*ptr)->pos, nptr->len); + nptr->pos = curptr - STATSTRPTR(newstat); + memcpy(nptr + 1, StopLow, sizeof(StatEntry) * (((StatEntry *) STATSTRPTR(stat)) - StopLow)); + } + else + { + while (sptr - STATPTR(stat) < stat->size && ptr - entry < len) + { + if (compareStatWord(sptr, *ptr, stat, txt) < 0) + { + memcpy(nptr, sptr, sizeof(StatEntry)); + sptr++; + } + else + { + if ((*ptr)->haspos) + nptr->nentry = (stat->weight) ? check_weight(txt, *ptr, stat->weight) : POSDATALEN(txt, *ptr); + else + nptr->nentry = 1; + nptr->ndoc = 1; + nptr->len = (*ptr)->len; + memcpy(curptr, STRPTR(txt) + (*ptr)->pos, nptr->len); + nptr->pos = curptr - STATSTRPTR(newstat); + curptr += nptr->len; + ptr++; + } + nptr++; + } + + memcpy(nptr, sptr, sizeof(StatEntry) * (stat->size - (sptr - STATPTR(stat)))); + + while (ptr - entry < len) + { + if ((*ptr)->haspos) + nptr->nentry = (stat->weight) ? check_weight(txt, *ptr, stat->weight) : POSDATALEN(txt, *ptr); + else + nptr->nentry = 1; + nptr->ndoc = 1; + nptr->len = (*ptr)->len; + memcpy(curptr, STRPTR(txt) + (*ptr)->pos, nptr->len); + nptr->pos = curptr - STATSTRPTR(newstat); + curptr += nptr->len; + ptr++; + nptr++; + } + } + + return newstat; +} + +static tsstat * +ts_accum(tsstat * stat, Datum data) +{ + tsstat *newstat; + TSVector txt = DatumGetTSVector(data); + WordEntry **newentry = NULL; + uint32 len = 0, + cur = 0; + StatEntry *sptr; + WordEntry *wptr; + int n = 0; + + if (stat == NULL) + { /* Init in first */ + stat = palloc(STATHDRSIZE); + SET_VARSIZE(stat, STATHDRSIZE); + stat->size = 0; + stat->weight = 0; + } + + /* simple check of correctness */ + if (txt == NULL || txt->size == 0) + { + if (txt != (TSVector) DatumGetPointer(data)) + pfree(txt); + return stat; + } + + sptr = STATPTR(stat); + wptr = ARRPTR(txt); + + if (stat->size < 100 * txt->size) + { /* merge */ + while (sptr - STATPTR(stat) < stat->size && wptr - ARRPTR(txt) < txt->size) + { + int cmp = compareStatWord(sptr, wptr, stat, txt); + + if (cmp < 0) + sptr++; + else if (cmp == 0) + { + if (stat->weight == 0) + { + sptr->ndoc++; + sptr->nentry += (wptr->haspos) ? POSDATALEN(txt, wptr) : 1; + } + else if (wptr->haspos && (n = check_weight(txt, wptr, stat->weight)) != 0) + { + sptr->ndoc++; + sptr->nentry += n; + } + sptr++; + wptr++; + } + else + { + if (stat->weight == 0 || check_weight(txt, wptr, stat->weight) != 0) + { + if (cur == len) + newentry = SEI_realloc(newentry, &len); + newentry[cur] = wptr; + cur++; + } + wptr++; + } + } + + while (wptr - ARRPTR(txt) < txt->size) + { + if (stat->weight == 0 || check_weight(txt, wptr, stat->weight) != 0) + { + if (cur == len) + newentry = SEI_realloc(newentry, &len); + newentry[cur] = wptr; + cur++; + } + wptr++; + } + } + else + { /* search */ + while (wptr - ARRPTR(txt) < txt->size) + { + StatEntry *StopLow = STATPTR(stat); + StatEntry *StopHigh = (StatEntry *) STATSTRPTR(stat); + int cmp; + + while (StopLow < StopHigh) + { + sptr = StopLow + (StopHigh - StopLow) / 2; + cmp = compareStatWord(sptr, wptr, stat, txt); + if (cmp == 0) + { + if (stat->weight == 0) + { + sptr->ndoc++; + sptr->nentry += (wptr->haspos) ? POSDATALEN(txt, wptr) : 1; + } + else if (wptr->haspos && (n = check_weight(txt, wptr, stat->weight)) != 0) + { + sptr->ndoc++; + sptr->nentry += n; + } + break; + } + else if (cmp < 0) + StopLow = sptr + 1; + else + StopHigh = sptr; + } + + if (StopLow >= StopHigh) + { /* not found */ + if (stat->weight == 0 || check_weight(txt, wptr, stat->weight) != 0) + { + if (cur == len) + newentry = SEI_realloc(newentry, &len); + newentry[cur] = wptr; + cur++; + } + } + wptr++; + } + } + + + if (cur == 0) + { /* no new words */ + if (txt != (TSVector) DatumGetPointer(data)) + pfree(txt); + return stat; + } + + newstat = formstat(stat, txt, newentry, cur); + pfree(newentry); + + if (txt != (TSVector) DatumGetPointer(data)) + pfree(txt); + return newstat; +} + +static void +ts_setup_firstcall(FunctionCallInfo fcinfo, FuncCallContext *funcctx, + tsstat * stat) +{ + TupleDesc tupdesc; + MemoryContext oldcontext; + StatStorage *st; + + oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + st = palloc(sizeof(StatStorage)); + st->cur = 0; + st->stat = palloc(VARSIZE(stat)); + memcpy(st->stat, stat, VARSIZE(stat)); + funcctx->user_fctx = (void *) st; + + tupdesc = CreateTemplateTupleDesc(3, false); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "word", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "ndoc", + INT4OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 3, "nentry", + INT4OID, -1, 0); + funcctx->tuple_desc = BlessTupleDesc(tupdesc); + funcctx->attinmeta = TupleDescGetAttInMetadata(tupdesc); + + MemoryContextSwitchTo(oldcontext); +} + + +static Datum +ts_process_call(FuncCallContext *funcctx) +{ + StatStorage *st; + + st = (StatStorage *) funcctx->user_fctx; + + if (st->cur < st->stat->size) + { + Datum result; + char *values[3]; + char ndoc[16]; + char nentry[16]; + StatEntry *entry = STATPTR(st->stat) + st->cur; + HeapTuple tuple; + + values[0] = palloc(entry->len + 1); + memcpy(values[0], STATSTRPTR(st->stat) + entry->pos, entry->len); + (values[0])[entry->len] = '\0'; + sprintf(ndoc, "%d", entry->ndoc); + values[1] = ndoc; + sprintf(nentry, "%d", entry->nentry); + values[2] = nentry; + + tuple = BuildTupleFromCStrings(funcctx->attinmeta, values); + result = HeapTupleGetDatum(tuple); + + pfree(values[0]); + st->cur++; + return result; + } + else + { + pfree(st->stat); + pfree(st); + } + + return (Datum) 0; +} + +static tsstat * +ts_stat_sql(text *txt, text *ws) +{ + char *query = TextPGetCString(txt); + int i; + tsstat *newstat, + *stat; + bool isnull; + Portal portal; + void *plan; + + if ((plan = SPI_prepare(query, 0, NULL)) == NULL) + /* internal error */ + elog(ERROR, "SPI_prepare(\"%s\") failed", query); + + if ((portal = SPI_cursor_open(NULL, plan, NULL, NULL, false)) == NULL) + /* internal error */ + elog(ERROR, "SPI_cursor_open(\"%s\") failed", query); + + SPI_cursor_fetch(portal, true, 100); + + if (SPI_tuptable->tupdesc->natts != 1 || + SPI_gettypeid(SPI_tuptable->tupdesc, 1) != TSVECTOROID) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("ts_stat query must return one tsvector column"))); + + stat = palloc(STATHDRSIZE); + SET_VARSIZE(stat, STATHDRSIZE); + stat->size = 0; + stat->weight = 0; + + if (ws) + { + char *buf; + + buf = VARDATA(ws); + while (buf - VARDATA(ws) < VARSIZE(ws) - VARHDRSZ) + { + if (pg_mblen(buf) == 1) + { + switch (*buf) + { + case 'A': + case 'a': + stat->weight |= 1 << 3; + break; + case 'B': + case 'b': + stat->weight |= 1 << 2; + break; + case 'C': + case 'c': + stat->weight |= 1 << 1; + break; + case 'D': + case 'd': + stat->weight |= 1; + break; + default: + stat->weight |= 0; + } + } + buf += pg_mblen(buf); + } + } + + while (SPI_processed > 0) + { + for (i = 0; i < SPI_processed; i++) + { + Datum data = SPI_getbinval(SPI_tuptable->vals[i], SPI_tuptable->tupdesc, 1, &isnull); + + if (!isnull) + { + newstat = ts_accum(stat, data); + if (stat != newstat && stat) + pfree(stat); + stat = newstat; + } + } + + SPI_freetuptable(SPI_tuptable); + SPI_cursor_fetch(portal, true, 100); + } + + SPI_freetuptable(SPI_tuptable); + SPI_cursor_close(portal); + SPI_freeplan(plan); + pfree(query); + + return stat; +} + +Datum +ts_stat1(PG_FUNCTION_ARGS) +{ + FuncCallContext *funcctx; + Datum result; + + if (SRF_IS_FIRSTCALL()) + { + tsstat *stat; + text *txt = PG_GETARG_TEXT_P(0); + + funcctx = SRF_FIRSTCALL_INIT(); + SPI_connect(); + stat = ts_stat_sql(txt, NULL); + PG_FREE_IF_COPY(txt, 0); + ts_setup_firstcall(fcinfo, funcctx, stat); + SPI_finish(); + } + + funcctx = SRF_PERCALL_SETUP(); + if ((result = ts_process_call(funcctx)) != (Datum) 0) + SRF_RETURN_NEXT(funcctx, result); + SRF_RETURN_DONE(funcctx); +} + +Datum +ts_stat2(PG_FUNCTION_ARGS) +{ + FuncCallContext *funcctx; + Datum result; + + if (SRF_IS_FIRSTCALL()) + { + tsstat *stat; + text *txt = PG_GETARG_TEXT_P(0); + text *ws = PG_GETARG_TEXT_P(1); + + funcctx = SRF_FIRSTCALL_INIT(); + SPI_connect(); + stat = ts_stat_sql(txt, ws); + PG_FREE_IF_COPY(txt, 0); + PG_FREE_IF_COPY(ws, 1); + ts_setup_firstcall(fcinfo, funcctx, stat); + SPI_finish(); + } + + funcctx = SRF_PERCALL_SETUP(); + if ((result = ts_process_call(funcctx)) != (Datum) 0) + SRF_RETURN_NEXT(funcctx, result); + SRF_RETURN_DONE(funcctx); +} + + +/* Check if datatype is TEXT or binary-equivalent to it */ +static bool +istexttype(Oid typid) +{ + /* varchar(n) and char(n) are binary-compatible with text */ + if (typid==TEXTOID || typid==VARCHAROID || typid==BPCHAROID) + return true; + /* Allow domains over these types, too */ + typid = getBaseType(typid); + if (typid==TEXTOID || typid==VARCHAROID || typid==BPCHAROID) + return true; + return false; +} + + +/* + * Triggers for automatic update of a tsvector column from text column(s) + * + * Trigger arguments are either + * name of tsvector col, name of tsconfig to use, name(s) of text col(s) + * name of tsvector col, name of regconfig col, name(s) of text col(s) + * ie, tsconfig can either be specified by name, or indirectly as the + * contents of a regconfig field in the row. If the name is used, it must + * be explicitly schema-qualified. + */ +Datum +tsvector_update_trigger_byid(PG_FUNCTION_ARGS) +{ + return tsvector_update_trigger(fcinfo, false); +} + +Datum +tsvector_update_trigger_bycolumn(PG_FUNCTION_ARGS) +{ + return tsvector_update_trigger(fcinfo, true); +} + +static Datum +tsvector_update_trigger(PG_FUNCTION_ARGS, bool config_column) +{ + TriggerData *trigdata; + Trigger *trigger; + Relation rel; + HeapTuple rettuple = NULL; + int tsvector_attr_num, + i; + ParsedText prs; + Datum datum; + bool isnull; + text *txt; + Oid cfgId; + + /* Check call context */ + if (!CALLED_AS_TRIGGER(fcinfo)) /* internal error */ + elog(ERROR, "tsvector_update_trigger: not fired by trigger manager"); + + trigdata = (TriggerData *) fcinfo->context; + if (TRIGGER_FIRED_FOR_STATEMENT(trigdata->tg_event)) + elog(ERROR, "tsvector_update_trigger: can't process STATEMENT events"); + if (TRIGGER_FIRED_AFTER(trigdata->tg_event)) + elog(ERROR, "tsvector_update_trigger: must be fired BEFORE event"); + + if (TRIGGER_FIRED_BY_INSERT(trigdata->tg_event)) + rettuple = trigdata->tg_trigtuple; + else if (TRIGGER_FIRED_BY_UPDATE(trigdata->tg_event)) + rettuple = trigdata->tg_newtuple; + else + elog(ERROR, "tsvector_update_trigger: must be fired for INSERT or UPDATE"); + + trigger = trigdata->tg_trigger; + rel = trigdata->tg_relation; + + if (trigger->tgnargs < 3) + elog(ERROR, "tsvector_update_trigger: arguments must be tsvector_field, ts_config, text_field1, ...)"); + + /* Find the target tsvector column */ + tsvector_attr_num = SPI_fnumber(rel->rd_att, trigger->tgargs[0]); + if (tsvector_attr_num == SPI_ERROR_NOATTRIBUTE) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_COLUMN), + errmsg("tsvector column \"%s\" does not exist", + trigger->tgargs[0]))); + if (SPI_gettypeid(rel->rd_att, tsvector_attr_num) != TSVECTOROID) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("column \"%s\" is not of tsvector type", + trigger->tgargs[0]))); + + /* Find the configuration to use */ + if (config_column) + { + int config_attr_num; + + config_attr_num = SPI_fnumber(rel->rd_att, trigger->tgargs[1]); + if (config_attr_num == SPI_ERROR_NOATTRIBUTE) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_COLUMN), + errmsg("config column \"%s\" does not exist", + trigger->tgargs[1]))); + if (SPI_gettypeid(rel->rd_att, config_attr_num) != REGCONFIGOID) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("column \"%s\" is not of regconfig type", + trigger->tgargs[1]))); + + datum = SPI_getbinval(rettuple, rel->rd_att, config_attr_num, &isnull); + if (isnull) + ereport(ERROR, + (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), + errmsg("config column \"%s\" must not be NULL", + trigger->tgargs[1]))); + cfgId = DatumGetObjectId(datum); + } + else + { + List *names; + + names = stringToQualifiedNameList(trigger->tgargs[1]); + /* require a schema so that results are not search path dependent */ + if (list_length(names) < 2) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("text search configuration name \"%s\" must be schema-qualified", + trigger->tgargs[1]))); + cfgId = TSConfigGetCfgid(names, false); + } + + /* initialize parse state */ + prs.lenwords = 32; + prs.curwords = 0; + prs.pos = 0; + prs.words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs.lenwords); + + /* find all words in indexable column(s) */ + for (i = 2; i < trigger->tgnargs; i++) + { + int numattr; + + numattr = SPI_fnumber(rel->rd_att, trigger->tgargs[i]); + if (numattr == SPI_ERROR_NOATTRIBUTE) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_COLUMN), + errmsg("column \"%s\" does not exist", + trigger->tgargs[i]))); + if (!istexttype(SPI_gettypeid(rel->rd_att, numattr))) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("column \"%s\" is not of character type", + trigger->tgargs[i]))); + + datum = SPI_getbinval(rettuple, rel->rd_att, numattr, &isnull); + if (isnull) + continue; + + txt = DatumGetTextP(datum); + + parsetext(cfgId, &prs, VARDATA(txt), VARSIZE(txt) - VARHDRSZ); + + if (txt != (text *) DatumGetPointer(datum)) + pfree(txt); + } + + /* make tsvector value */ + if (prs.curwords) + { + datum = PointerGetDatum(make_tsvector(&prs)); + rettuple = SPI_modifytuple(rel, rettuple, 1, &tsvector_attr_num, + &datum, NULL); + pfree(DatumGetPointer(datum)); + } + else + { + TSVector out = palloc(CALCDATASIZE(0, 0)); + + SET_VARSIZE(out, CALCDATASIZE(0, 0)); + out->size = 0; + datum = PointerGetDatum(out); + rettuple = SPI_modifytuple(rel, rettuple, 1, &tsvector_attr_num, + &datum, NULL); + pfree(prs.words); + } + + if (rettuple == NULL) /* internal error */ + elog(ERROR, "tsvector_update_trigger: %d returned by SPI_modifytuple", + SPI_result); + + return PointerGetDatum(rettuple); +} |