diff options
Diffstat (limited to 'src/backend/utils/adt')
-rw-r--r-- | src/backend/utils/adt/Makefile | 9 | ||||
-rw-r--r-- | src/backend/utils/adt/regproc.c | 229 | ||||
-rw-r--r-- | src/backend/utils/adt/selfuncs.c | 6 | ||||
-rw-r--r-- | src/backend/utils/adt/tsginidx.c | 157 | ||||
-rw-r--r-- | src/backend/utils/adt/tsgistidx.c | 784 | ||||
-rw-r--r-- | src/backend/utils/adt/tsquery.c | 767 | ||||
-rw-r--r-- | src/backend/utils/adt/tsquery_cleanup.c | 261 | ||||
-rw-r--r-- | src/backend/utils/adt/tsquery_gist.c | 259 | ||||
-rw-r--r-- | src/backend/utils/adt/tsquery_op.c | 289 | ||||
-rw-r--r-- | src/backend/utils/adt/tsquery_rewrite.c | 524 | ||||
-rw-r--r-- | src/backend/utils/adt/tsquery_util.c | 317 | ||||
-rw-r--r-- | src/backend/utils/adt/tsrank.c | 804 | ||||
-rw-r--r-- | src/backend/utils/adt/tsvector.c | 683 | ||||
-rw-r--r-- | src/backend/utils/adt/tsvector_op.c | 1334 |
14 files changed, 6418 insertions, 5 deletions
diff --git a/src/backend/utils/adt/Makefile b/src/backend/utils/adt/Makefile index 12d158a49bd..73fcebb496d 100644 --- a/src/backend/utils/adt/Makefile +++ b/src/backend/utils/adt/Makefile @@ -1,7 +1,7 @@ # # Makefile for utils/adt # -# $PostgreSQL: pgsql/src/backend/utils/adt/Makefile,v 1.64 2007/04/02 03:49:39 tgl Exp $ +# $PostgreSQL: pgsql/src/backend/utils/adt/Makefile,v 1.65 2007/08/21 01:11:18 tgl Exp $ # subdir = src/backend/utils/adt @@ -25,8 +25,11 @@ OBJS = acl.o arrayfuncs.o array_userfuncs.o arrayutils.o bool.o \ tid.o timestamp.o varbit.o varchar.o varlena.o version.o xid.o \ network.o mac.o inet_net_ntop.o inet_net_pton.o \ ri_triggers.o pg_lzcompress.o pg_locale.o formatting.o \ - ascii.o quote.o pgstatfuncs.o encode.o dbsize.o genfile.o xml.o \ - uuid.o + ascii.o quote.o pgstatfuncs.o encode.o dbsize.o genfile.o \ + tsginidx.o tsgistidx.o tsquery.o tsquery_cleanup.o tsquery_gist.o \ + tsquery_op.o tsquery_rewrite.o tsquery_util.o tsrank.o \ + tsvector.o tsvector_op.o \ + uuid.o xml.o like.o: like.c like_match.c diff --git a/src/backend/utils/adt/regproc.c b/src/backend/utils/adt/regproc.c index 7ec9db3ec9b..e49f323daa8 100644 --- a/src/backend/utils/adt/regproc.c +++ b/src/backend/utils/adt/regproc.c @@ -13,7 +13,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/adt/regproc.c,v 1.102 2007/06/26 16:48:09 alvherre Exp $ + * $PostgreSQL: pgsql/src/backend/utils/adt/regproc.c,v 1.103 2007/08/21 01:11:18 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -27,6 +27,8 @@ #include "catalog/namespace.h" #include "catalog/pg_operator.h" #include "catalog/pg_proc.h" +#include "catalog/pg_ts_config.h" +#include "catalog/pg_ts_dict.h" #include "catalog/pg_type.h" #include "miscadmin.h" #include "parser/parse_type.h" @@ -1066,6 +1068,231 @@ regtypesend(PG_FUNCTION_ARGS) /* + * regconfigin - converts "tsconfigname" to tsconfig OID + * + * We also accept a numeric OID, for symmetry with the output routine. + * + * '-' signifies unknown (OID 0). In all other cases, the input must + * match an existing pg_ts_config entry. + * + * This function is not needed in bootstrap mode, so we don't worry about + * making it work then. + */ +Datum +regconfigin(PG_FUNCTION_ARGS) +{ + char *cfg_name_or_oid = PG_GETARG_CSTRING(0); + Oid result; + List *names; + + /* '-' ? */ + if (strcmp(cfg_name_or_oid, "-") == 0) + PG_RETURN_OID(InvalidOid); + + /* Numeric OID? */ + if (cfg_name_or_oid[0] >= '0' && + cfg_name_or_oid[0] <= '9' && + strspn(cfg_name_or_oid, "0123456789") == strlen(cfg_name_or_oid)) + { + result = DatumGetObjectId(DirectFunctionCall1(oidin, + CStringGetDatum(cfg_name_or_oid))); + PG_RETURN_OID(result); + } + + /* + * Normal case: parse the name into components and see if it matches any + * pg_ts_config entries in the current search path. + */ + names = stringToQualifiedNameList(cfg_name_or_oid); + + result = TSConfigGetCfgid(names, false); + + PG_RETURN_OID(result); +} + +/* + * regconfigout - converts tsconfig OID to "tsconfigname" + */ +Datum +regconfigout(PG_FUNCTION_ARGS) +{ + Oid cfgid = PG_GETARG_OID(0); + char *result; + HeapTuple cfgtup; + + if (cfgid == InvalidOid) + { + result = pstrdup("-"); + PG_RETURN_CSTRING(result); + } + + cfgtup = SearchSysCache(TSCONFIGOID, + ObjectIdGetDatum(cfgid), + 0, 0, 0); + + if (HeapTupleIsValid(cfgtup)) + { + Form_pg_ts_config cfgform = (Form_pg_ts_config) GETSTRUCT(cfgtup); + char *cfgname = NameStr(cfgform->cfgname); + char *nspname; + + /* + * Would this config be found by regconfigin? If not, qualify it. + */ + if (TSConfigIsVisible(cfgid)) + nspname = NULL; + else + nspname = get_namespace_name(cfgform->cfgnamespace); + + result = quote_qualified_identifier(nspname, cfgname); + + ReleaseSysCache(cfgtup); + } + else + { + /* If OID doesn't match any pg_ts_config row, return it numerically */ + result = (char *) palloc(NAMEDATALEN); + snprintf(result, NAMEDATALEN, "%u", cfgid); + } + + PG_RETURN_CSTRING(result); +} + +/* + * regconfigrecv - converts external binary format to regconfig + */ +Datum +regconfigrecv(PG_FUNCTION_ARGS) +{ + /* Exactly the same as oidrecv, so share code */ + return oidrecv(fcinfo); +} + +/* + * regconfigsend - converts regconfig to binary format + */ +Datum +regconfigsend(PG_FUNCTION_ARGS) +{ + /* Exactly the same as oidsend, so share code */ + return oidsend(fcinfo); +} + + +/* + * regdictionaryin - converts "tsdictionaryname" to tsdictionary OID + * + * We also accept a numeric OID, for symmetry with the output routine. + * + * '-' signifies unknown (OID 0). In all other cases, the input must + * match an existing pg_ts_dict entry. + * + * This function is not needed in bootstrap mode, so we don't worry about + * making it work then. + */ +Datum +regdictionaryin(PG_FUNCTION_ARGS) +{ + char *dict_name_or_oid = PG_GETARG_CSTRING(0); + Oid result; + List *names; + + /* '-' ? */ + if (strcmp(dict_name_or_oid, "-") == 0) + PG_RETURN_OID(InvalidOid); + + /* Numeric OID? */ + if (dict_name_or_oid[0] >= '0' && + dict_name_or_oid[0] <= '9' && + strspn(dict_name_or_oid, "0123456789") == strlen(dict_name_or_oid)) + { + result = DatumGetObjectId(DirectFunctionCall1(oidin, + CStringGetDatum(dict_name_or_oid))); + PG_RETURN_OID(result); + } + + /* + * Normal case: parse the name into components and see if it matches any + * pg_ts_dict entries in the current search path. + */ + names = stringToQualifiedNameList(dict_name_or_oid); + + result = TSDictionaryGetDictid(names, false); + + PG_RETURN_OID(result); +} + +/* + * regdictionaryout - converts tsdictionary OID to "tsdictionaryname" + */ +Datum +regdictionaryout(PG_FUNCTION_ARGS) +{ + Oid dictid = PG_GETARG_OID(0); + char *result; + HeapTuple dicttup; + + if (dictid == InvalidOid) + { + result = pstrdup("-"); + PG_RETURN_CSTRING(result); + } + + dicttup = SearchSysCache(TSDICTOID, + ObjectIdGetDatum(dictid), + 0, 0, 0); + + if (HeapTupleIsValid(dicttup)) + { + Form_pg_ts_dict dictform = (Form_pg_ts_dict) GETSTRUCT(dicttup); + char *dictname = NameStr(dictform->dictname); + char *nspname; + + /* + * Would this dictionary be found by regdictionaryin? + * If not, qualify it. + */ + if (TSDictionaryIsVisible(dictid)) + nspname = NULL; + else + nspname = get_namespace_name(dictform->dictnamespace); + + result = quote_qualified_identifier(nspname, dictname); + + ReleaseSysCache(dicttup); + } + else + { + /* If OID doesn't match any pg_ts_dict row, return it numerically */ + result = (char *) palloc(NAMEDATALEN); + snprintf(result, NAMEDATALEN, "%u", dictid); + } + + PG_RETURN_CSTRING(result); +} + +/* + * regdictionaryrecv - converts external binary format to regdictionary + */ +Datum +regdictionaryrecv(PG_FUNCTION_ARGS) +{ + /* Exactly the same as oidrecv, so share code */ + return oidrecv(fcinfo); +} + +/* + * regdictionarysend - converts regdictionary to binary format + */ +Datum +regdictionarysend(PG_FUNCTION_ARGS) +{ + /* Exactly the same as oidsend, so share code */ + return oidsend(fcinfo); +} + + +/* * text_regclass: convert text to regclass * * This could be replaced by CoerceViaIO, except that we need to treat diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index 1ba0b4eac48..60da08b1ec4 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -15,7 +15,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.234 2007/05/05 17:05:48 mha Exp $ + * $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.235 2007/08/21 01:11:18 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -2822,6 +2822,8 @@ convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue, case REGOPERATOROID: case REGCLASSOID: case REGTYPEOID: + case REGCONFIGOID: + case REGDICTIONARYOID: *scaledvalue = convert_numeric_to_scalar(value, valuetypid); *scaledlobound = convert_numeric_to_scalar(lobound, boundstypid); *scaledhibound = convert_numeric_to_scalar(hibound, boundstypid); @@ -2925,6 +2927,8 @@ convert_numeric_to_scalar(Datum value, Oid typid) case REGOPERATOROID: case REGCLASSOID: case REGTYPEOID: + case REGCONFIGOID: + case REGDICTIONARYOID: /* we can treat OIDs as integers... */ return (double) DatumGetObjectId(value); } diff --git a/src/backend/utils/adt/tsginidx.c b/src/backend/utils/adt/tsginidx.c new file mode 100644 index 00000000000..491dd21aa81 --- /dev/null +++ b/src/backend/utils/adt/tsginidx.c @@ -0,0 +1,157 @@ +/*------------------------------------------------------------------------- + * + * tsginidx.c + * GIN support functions for tsvector_ops + * + * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * $PostgreSQL: pgsql/src/backend/utils/adt/tsginidx.c,v 1.1 2007/08/21 01:11:19 tgl Exp $ + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/skey.h" +#include "tsearch/ts_type.h" +#include "tsearch/ts_utils.h" + + +Datum +gin_extract_tsvector(PG_FUNCTION_ARGS) +{ + TSVector vector = PG_GETARG_TSVECTOR(0); + uint32 *nentries = (uint32 *) PG_GETARG_POINTER(1); + Datum *entries = NULL; + + *nentries = 0; + if (vector->size > 0) + { + int i; + WordEntry *we = ARRPTR(vector); + + *nentries = (uint32) vector->size; + entries = (Datum *) palloc(sizeof(Datum) * vector->size); + + for (i = 0; i < vector->size; i++) + { + text *txt = (text *) palloc(VARHDRSZ + we->len); + + SET_VARSIZE(txt, VARHDRSZ + we->len); + memcpy(VARDATA(txt), STRPTR(vector) + we->pos, we->len); + + entries[i] = PointerGetDatum(txt); + + we++; + } + } + + PG_FREE_IF_COPY(vector, 0); + PG_RETURN_POINTER(entries); +} + +Datum +gin_extract_query(PG_FUNCTION_ARGS) +{ + TSQuery query = PG_GETARG_TSQUERY(0); + uint32 *nentries = (uint32 *) PG_GETARG_POINTER(1); + StrategyNumber strategy = PG_GETARG_UINT16(2); + Datum *entries = NULL; + + *nentries = 0; + + if (query->size > 0) + { + int4 i, + j = 0, + len; + QueryItem *item; + + item = clean_NOT(GETQUERY(query), &len); + if (!item) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("query requires full scan, which is not supported by GIN indexes"))); + + item = GETQUERY(query); + + for (i = 0; i < query->size; i++) + if (item[i].type == VAL) + (*nentries)++; + + entries = (Datum *) palloc(sizeof(Datum) * (*nentries)); + + for (i = 0; i < query->size; i++) + if (item[i].type == VAL) + { + text *txt; + + txt = (text *) palloc(VARHDRSZ + item[i].length); + + SET_VARSIZE(txt, VARHDRSZ + item[i].length); + memcpy(VARDATA(txt), GETOPERAND(query) + item[i].distance, item[i].length); + + entries[j++] = PointerGetDatum(txt); + + if (strategy != TSearchWithClassStrategyNumber && item[i].weight != 0) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("@@ operator does not support lexeme class restrictions"), + errhint("Use the @@@ operator instead."))); + } + } + else + *nentries = -1; /* nothing can be found */ + + PG_FREE_IF_COPY(query, 0); + + PG_RETURN_POINTER(entries); +} + +typedef struct +{ + QueryItem *frst; + bool *mapped_check; +} GinChkVal; + +static bool +checkcondition_gin(void *checkval, QueryItem * val) +{ + GinChkVal *gcv = (GinChkVal *) checkval; + + return gcv->mapped_check[val - gcv->frst]; +} + +Datum +gin_ts_consistent(PG_FUNCTION_ARGS) +{ + bool *check = (bool *) PG_GETARG_POINTER(0); + /* StrategyNumber strategy = PG_GETARG_UINT16(1); */ + TSQuery query = PG_GETARG_TSQUERY(2); + bool res = FALSE; + + if (query->size > 0) + { + int4 i, + j = 0; + QueryItem *item; + GinChkVal gcv; + + gcv.frst = item = GETQUERY(query); + gcv.mapped_check = (bool *) palloc(sizeof(bool) * query->size); + + for (i = 0; i < query->size; i++) + if (item[i].type == VAL) + gcv.mapped_check[i] = check[j++]; + + res = TS_execute( + GETQUERY(query), + &gcv, + true, + checkcondition_gin + ); + } + + PG_RETURN_BOOL(res); +} diff --git a/src/backend/utils/adt/tsgistidx.c b/src/backend/utils/adt/tsgistidx.c new file mode 100644 index 00000000000..04c2b4c442f --- /dev/null +++ b/src/backend/utils/adt/tsgistidx.c @@ -0,0 +1,784 @@ +/*------------------------------------------------------------------------- + * + * tsgistidx.c + * GiST support functions for tsvector_ops + * + * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * $PostgreSQL: pgsql/src/backend/utils/adt/tsgistidx.c,v 1.1 2007/08/21 01:11:19 tgl Exp $ + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/gist.h" +#include "access/tuptoaster.h" +#include "tsearch/ts_type.h" +#include "tsearch/ts_utils.h" +#include "utils/pg_crc.h" + + +#define SIGLENINT 31 /* >121 => key will toast, so it will not work + * !!! */ + +#define SIGLEN ( sizeof(int4) * SIGLENINT ) +#define SIGLENBIT (SIGLEN * BITS_PER_BYTE) + +typedef char BITVEC[SIGLEN]; +typedef char *BITVECP; + +#define LOOPBYTE(a) \ + for(i=0;i<SIGLEN;i++) {\ + a;\ +} + +#define GETBYTE(x,i) ( *( (BITVECP)(x) + (int)( (i) / BITS_PER_BYTE ) ) ) +#define GETBITBYTE(x,i) ( ((char)(x)) >> (i) & 0x01 ) +#define CLRBIT(x,i) GETBYTE(x,i) &= ~( 0x01 << ( (i) % BITS_PER_BYTE ) ) +#define SETBIT(x,i) GETBYTE(x,i) |= ( 0x01 << ( (i) % BITS_PER_BYTE ) ) +#define GETBIT(x,i) ( (GETBYTE(x,i) >> ( (i) % BITS_PER_BYTE )) & 0x01 ) + +#define HASHVAL(val) (((unsigned int)(val)) % SIGLENBIT) +#define HASH(sign, val) SETBIT((sign), HASHVAL(val)) + +#define GETENTRY(vec,pos) ((SignTSVector *) DatumGetPointer((vec)->vector[(pos)].key)) + +/* + * type of GiST index key + */ + +typedef struct +{ + int32 vl_len_; /* varlena header (do not touch directly!) */ ; + int4 flag; + char data[1]; +} SignTSVector; + +#define ARRKEY 0x01 +#define SIGNKEY 0x02 +#define ALLISTRUE 0x04 + +#define ISARRKEY(x) ( ((SignTSVector*)(x))->flag & ARRKEY ) +#define ISSIGNKEY(x) ( ((SignTSVector*)(x))->flag & SIGNKEY ) +#define ISALLTRUE(x) ( ((SignTSVector*)(x))->flag & ALLISTRUE ) + +#define GTHDRSIZE ( VARHDRSZ + sizeof(int4) ) +#define CALCGTSIZE(flag, len) ( GTHDRSIZE + ( ( (flag) & ARRKEY ) ? ((len)*sizeof(int4)) : (((flag) & ALLISTRUE) ? 0 : SIGLEN) ) ) + +#define GETSIGN(x) ( (BITVECP)( (char*)(x)+GTHDRSIZE ) ) +#define GETARR(x) ( (int4*)( (char*)(x)+GTHDRSIZE ) ) +#define ARRNELEM(x) ( ( VARSIZE(x) - GTHDRSIZE )/sizeof(int4) ) + +/* Number of one-bits in an unsigned byte */ +static const uint8 number_of_ones[256] = { + 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8 +}; + +static int4 sizebitvec(BITVECP sign); + +Datum +gtsvectorin(PG_FUNCTION_ARGS) +{ + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("gtsvector_in not implemented"))); + PG_RETURN_DATUM(0); +} + +#define SINGOUTSTR "%d true bits, %d false bits" +#define ARROUTSTR "%d unique words" +#define EXTRALEN ( 2*13 ) + +static int outbuf_maxlen = 0; + +Datum +gtsvectorout(PG_FUNCTION_ARGS) +{ + SignTSVector *key = (SignTSVector *) DatumGetPointer(PG_DETOAST_DATUM(PG_GETARG_POINTER(0))); + char *outbuf; + + if (outbuf_maxlen == 0) + outbuf_maxlen = 2 * EXTRALEN + Max(strlen(SINGOUTSTR), strlen(ARROUTSTR)) + 1; + outbuf = palloc(outbuf_maxlen); + + if (ISARRKEY(key)) + sprintf(outbuf, ARROUTSTR, (int) ARRNELEM(key)); + else + { + int cnttrue = (ISALLTRUE(key)) ? SIGLENBIT : sizebitvec(GETSIGN(key)); + + sprintf(outbuf, SINGOUTSTR, cnttrue, (int) SIGLENBIT - cnttrue); + } + + PG_FREE_IF_COPY(key, 0); + PG_RETURN_POINTER(outbuf); +} + +static int +compareint(const void *a, const void *b) +{ + if (*((int4 *) a) == *((int4 *) b)) + return 0; + return (*((int4 *) a) > *((int4 *) b)) ? 1 : -1; +} + +static int +uniqueint(int4 *a, int4 l) +{ + int4 *ptr, + *res; + + if (l == 1) + return l; + + ptr = res = a; + + qsort((void *) a, l, sizeof(int4), compareint); + + while (ptr - a < l) + if (*ptr != *res) + *(++res) = *ptr++; + else + ptr++; + return res + 1 - a; +} + +static void +makesign(BITVECP sign, SignTSVector * a) +{ + int4 k, + len = ARRNELEM(a); + int4 *ptr = GETARR(a); + + MemSet((void *) sign, 0, sizeof(BITVEC)); + for (k = 0; k < len; k++) + HASH(sign, ptr[k]); +} + +Datum +gtsvector_compress(PG_FUNCTION_ARGS) +{ + GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0); + GISTENTRY *retval = entry; + + if (entry->leafkey) + { /* tsvector */ + SignTSVector *res; + TSVector val = DatumGetTSVector(entry->key); + int4 len; + int4 *arr; + WordEntry *ptr = ARRPTR(val); + char *words = STRPTR(val); + + len = CALCGTSIZE(ARRKEY, val->size); + res = (SignTSVector *) palloc(len); + SET_VARSIZE(res, len); + res->flag = ARRKEY; + arr = GETARR(res); + len = val->size; + while (len--) + { + pg_crc32 c; + + INIT_CRC32(c); + COMP_CRC32(c, words + ptr->pos, ptr->len); + FIN_CRC32(c); + + *arr = *(int4 *) &c; + arr++; + ptr++; + } + + len = uniqueint(GETARR(res), val->size); + if (len != val->size) + { + /* + * there is a collision of hash-function; len is always less than + * val->size + */ + len = CALCGTSIZE(ARRKEY, len); + res = (SignTSVector *) repalloc((void *) res, len); + SET_VARSIZE(res, len); + } + + /* make signature, if array is too long */ + if (VARSIZE(res) > TOAST_INDEX_TARGET) + { + SignTSVector *ressign; + + len = CALCGTSIZE(SIGNKEY, 0); + ressign = (SignTSVector *) palloc(len); + SET_VARSIZE(ressign, len); + ressign->flag = SIGNKEY; + makesign(GETSIGN(ressign), res); + res = ressign; + } + + retval = (GISTENTRY *) palloc(sizeof(GISTENTRY)); + gistentryinit(*retval, PointerGetDatum(res), + entry->rel, entry->page, + entry->offset, FALSE); + } + else if (ISSIGNKEY(DatumGetPointer(entry->key)) && + !ISALLTRUE(DatumGetPointer(entry->key))) + { + int4 i, + len; + SignTSVector *res; + BITVECP sign = GETSIGN(DatumGetPointer(entry->key)); + + LOOPBYTE( + if ((sign[i] & 0xff) != 0xff) + PG_RETURN_POINTER(retval); + ); + + len = CALCGTSIZE(SIGNKEY | ALLISTRUE, 0); + res = (SignTSVector *) palloc(len); + SET_VARSIZE(res, len); + res->flag = SIGNKEY | ALLISTRUE; + + retval = (GISTENTRY *) palloc(sizeof(GISTENTRY)); + gistentryinit(*retval, PointerGetDatum(res), + entry->rel, entry->page, + entry->offset, FALSE); + } + PG_RETURN_POINTER(retval); +} + +Datum +gtsvector_decompress(PG_FUNCTION_ARGS) +{ + GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0); + SignTSVector *key = (SignTSVector *) DatumGetPointer(PG_DETOAST_DATUM(entry->key)); + + if (key != (SignTSVector *) DatumGetPointer(entry->key)) + { + GISTENTRY *retval = (GISTENTRY *) palloc(sizeof(GISTENTRY)); + + gistentryinit(*retval, PointerGetDatum(key), + entry->rel, entry->page, + entry->offset, FALSE); + + PG_RETURN_POINTER(retval); + } + + PG_RETURN_POINTER(entry); +} + +typedef struct +{ + int4 *arrb; + int4 *arre; +} CHKVAL; + +/* + * is there value 'val' in array or not ? + */ +static bool +checkcondition_arr(void *checkval, QueryItem * val) +{ + int4 *StopLow = ((CHKVAL *) checkval)->arrb; + int4 *StopHigh = ((CHKVAL *) checkval)->arre; + int4 *StopMiddle; + + /* Loop invariant: StopLow <= val < StopHigh */ + + while (StopLow < StopHigh) + { + StopMiddle = StopLow + (StopHigh - StopLow) / 2; + if (*StopMiddle == val->val) + return (true); + else if (*StopMiddle < val->val) + StopLow = StopMiddle + 1; + else + StopHigh = StopMiddle; + } + + return (false); +} + +static bool +checkcondition_bit(void *checkval, QueryItem * val) +{ + return GETBIT(checkval, HASHVAL(val->val)); +} + +Datum +gtsvector_consistent(PG_FUNCTION_ARGS) +{ + TSQuery query = PG_GETARG_TSQUERY(1); + SignTSVector *key = (SignTSVector *) DatumGetPointer( + ((GISTENTRY *) PG_GETARG_POINTER(0))->key + ); + + if (!query->size) + PG_RETURN_BOOL(false); + + if (ISSIGNKEY(key)) + { + if (ISALLTRUE(key)) + PG_RETURN_BOOL(true); + + PG_RETURN_BOOL(TS_execute( + GETQUERY(query), + (void *) GETSIGN(key), false, + checkcondition_bit + )); + } + else + { /* only leaf pages */ + CHKVAL chkval; + + chkval.arrb = GETARR(key); + chkval.arre = chkval.arrb + ARRNELEM(key); + PG_RETURN_BOOL(TS_execute( + GETQUERY(query), + (void *) &chkval, true, + checkcondition_arr + )); + } +} + +static int4 +unionkey(BITVECP sbase, SignTSVector * add) +{ + int4 i; + + if (ISSIGNKEY(add)) + { + BITVECP sadd = GETSIGN(add); + + if (ISALLTRUE(add)) + return 1; + + LOOPBYTE( + sbase[i] |= sadd[i]; + ); + } + else + { + int4 *ptr = GETARR(add); + + for (i = 0; i < ARRNELEM(add); i++) + HASH(sbase, ptr[i]); + } + return 0; +} + + +Datum +gtsvector_union(PG_FUNCTION_ARGS) +{ + GistEntryVector *entryvec = (GistEntryVector *) PG_GETARG_POINTER(0); + int *size = (int *) PG_GETARG_POINTER(1); + BITVEC base; + int4 i, + len; + int4 flag = 0; + SignTSVector *result; + + MemSet((void *) base, 0, sizeof(BITVEC)); + for (i = 0; i < entryvec->n; i++) + { + if (unionkey(base, GETENTRY(entryvec, i))) + { + flag = ALLISTRUE; + break; + } + } + + flag |= SIGNKEY; + len = CALCGTSIZE(flag, 0); + result = (SignTSVector *) palloc(len); + *size = len; + SET_VARSIZE(result, len); + result->flag = flag; + if (!ISALLTRUE(result)) + memcpy((void *) GETSIGN(result), (void *) base, sizeof(BITVEC)); + + PG_RETURN_POINTER(result); +} + +Datum +gtsvector_same(PG_FUNCTION_ARGS) +{ + SignTSVector *a = (SignTSVector *) PG_GETARG_POINTER(0); + SignTSVector *b = (SignTSVector *) PG_GETARG_POINTER(1); + bool *result = (bool *) PG_GETARG_POINTER(2); + + if (ISSIGNKEY(a)) + { /* then b also ISSIGNKEY */ + if (ISALLTRUE(a) && ISALLTRUE(b)) + *result = true; + else if (ISALLTRUE(a)) + *result = false; + else if (ISALLTRUE(b)) + *result = false; + else + { + int4 i; + BITVECP sa = GETSIGN(a), + sb = GETSIGN(b); + + *result = true; + LOOPBYTE( + if (sa[i] != sb[i]) + { + *result = false; + break; + } + ); + } + } + else + { /* a and b ISARRKEY */ + int4 lena = ARRNELEM(a), + lenb = ARRNELEM(b); + + if (lena != lenb) + *result = false; + else + { + int4 *ptra = GETARR(a), + *ptrb = GETARR(b); + int4 i; + + *result = true; + for (i = 0; i < lena; i++) + if (ptra[i] != ptrb[i]) + { + *result = false; + break; + } + } + } + + PG_RETURN_POINTER(result); +} + +static int4 +sizebitvec(BITVECP sign) +{ + int4 size = 0, + i; + + LOOPBYTE( + size += number_of_ones[(unsigned char) sign[i]]; + ); + return size; +} + +static int +hemdistsign(BITVECP a, BITVECP b) +{ + int i, + diff, + dist = 0; + + LOOPBYTE( + diff = (unsigned char) (a[i] ^ b[i]); + dist += number_of_ones[diff]; + ); + return dist; +} + +static int +hemdist(SignTSVector * a, SignTSVector * b) +{ + if (ISALLTRUE(a)) + { + if (ISALLTRUE(b)) + return 0; + else + return SIGLENBIT - sizebitvec(GETSIGN(b)); + } + else if (ISALLTRUE(b)) + return SIGLENBIT - sizebitvec(GETSIGN(a)); + + return hemdistsign(GETSIGN(a), GETSIGN(b)); +} + +Datum +gtsvector_penalty(PG_FUNCTION_ARGS) +{ + GISTENTRY *origentry = (GISTENTRY *) PG_GETARG_POINTER(0); /* always ISSIGNKEY */ + GISTENTRY *newentry = (GISTENTRY *) PG_GETARG_POINTER(1); + float *penalty = (float *) PG_GETARG_POINTER(2); + SignTSVector *origval = (SignTSVector *) DatumGetPointer(origentry->key); + SignTSVector *newval = (SignTSVector *) DatumGetPointer(newentry->key); + BITVECP orig = GETSIGN(origval); + + *penalty = 0.0; + + if (ISARRKEY(newval)) + { + BITVEC sign; + + makesign(sign, newval); + + if (ISALLTRUE(origval)) + *penalty = ((float) (SIGLENBIT - sizebitvec(sign))) / (float) (SIGLENBIT + 1); + else + *penalty = hemdistsign(sign, orig); + } + else + *penalty = hemdist(origval, newval); + PG_RETURN_POINTER(penalty); +} + +typedef struct +{ + bool allistrue; + BITVEC sign; +} CACHESIGN; + +static void +fillcache(CACHESIGN * item, SignTSVector * key) +{ + item->allistrue = false; + if (ISARRKEY(key)) + makesign(item->sign, key); + else if (ISALLTRUE(key)) + item->allistrue = true; + else + memcpy((void *) item->sign, (void *) GETSIGN(key), sizeof(BITVEC)); +} + +#define WISH_F(a,b,c) (double)( -(double)(((a)-(b))*((a)-(b))*((a)-(b)))*(c) ) +typedef struct +{ + OffsetNumber pos; + int4 cost; +} SPLITCOST; + +static int +comparecost(const void *a, const void *b) +{ + if (((SPLITCOST *) a)->cost == ((SPLITCOST *) b)->cost) + return 0; + else + return (((SPLITCOST *) a)->cost > ((SPLITCOST *) b)->cost) ? 1 : -1; +} + + +static int +hemdistcache(CACHESIGN * a, CACHESIGN * b) +{ + if (a->allistrue) + { + if (b->allistrue) + return 0; + else + return SIGLENBIT - sizebitvec(b->sign); + } + else if (b->allistrue) + return SIGLENBIT - sizebitvec(a->sign); + + return hemdistsign(a->sign, b->sign); +} + +Datum +gtsvector_picksplit(PG_FUNCTION_ARGS) +{ + GistEntryVector *entryvec = (GistEntryVector *) PG_GETARG_POINTER(0); + GIST_SPLITVEC *v = (GIST_SPLITVEC *) PG_GETARG_POINTER(1); + OffsetNumber k, + j; + SignTSVector *datum_l, + *datum_r; + BITVECP union_l, + union_r; + int4 size_alpha, + size_beta; + int4 size_waste, + waste = -1; + int4 nbytes; + OffsetNumber seed_1 = 0, + seed_2 = 0; + OffsetNumber *left, + *right; + OffsetNumber maxoff; + BITVECP ptr; + int i; + CACHESIGN *cache; + SPLITCOST *costvector; + + maxoff = entryvec->n - 2; + nbytes = (maxoff + 2) * sizeof(OffsetNumber); + v->spl_left = (OffsetNumber *) palloc(nbytes); + v->spl_right = (OffsetNumber *) palloc(nbytes); + + cache = (CACHESIGN *) palloc(sizeof(CACHESIGN) * (maxoff + 2)); + fillcache(&cache[FirstOffsetNumber], GETENTRY(entryvec, FirstOffsetNumber)); + + for (k = FirstOffsetNumber; k < maxoff; k = OffsetNumberNext(k)) + { + for (j = OffsetNumberNext(k); j <= maxoff; j = OffsetNumberNext(j)) + { + if (k == FirstOffsetNumber) + fillcache(&cache[j], GETENTRY(entryvec, j)); + + size_waste = hemdistcache(&(cache[j]), &(cache[k])); + if (size_waste > waste) + { + waste = size_waste; + seed_1 = k; + seed_2 = j; + } + } + } + + left = v->spl_left; + v->spl_nleft = 0; + right = v->spl_right; + v->spl_nright = 0; + + if (seed_1 == 0 || seed_2 == 0) + { + seed_1 = 1; + seed_2 = 2; + } + + /* form initial .. */ + if (cache[seed_1].allistrue) + { + datum_l = (SignTSVector *) palloc(CALCGTSIZE(SIGNKEY | ALLISTRUE, 0)); + SET_VARSIZE(datum_l, CALCGTSIZE(SIGNKEY | ALLISTRUE, 0)); + datum_l->flag = SIGNKEY | ALLISTRUE; + } + else + { + datum_l = (SignTSVector *) palloc(CALCGTSIZE(SIGNKEY, 0)); + SET_VARSIZE(datum_l, CALCGTSIZE(SIGNKEY, 0)); + datum_l->flag = SIGNKEY; + memcpy((void *) GETSIGN(datum_l), (void *) cache[seed_1].sign, sizeof(BITVEC)); + } + if (cache[seed_2].allistrue) + { + datum_r = (SignTSVector *) palloc(CALCGTSIZE(SIGNKEY | ALLISTRUE, 0)); + SET_VARSIZE(datum_r, CALCGTSIZE(SIGNKEY | ALLISTRUE, 0)); + datum_r->flag = SIGNKEY | ALLISTRUE; + } + else + { + datum_r = (SignTSVector *) palloc(CALCGTSIZE(SIGNKEY, 0)); + SET_VARSIZE(datum_r, CALCGTSIZE(SIGNKEY, 0)); + datum_r->flag = SIGNKEY; + memcpy((void *) GETSIGN(datum_r), (void *) cache[seed_2].sign, sizeof(BITVEC)); + } + + union_l = GETSIGN(datum_l); + union_r = GETSIGN(datum_r); + maxoff = OffsetNumberNext(maxoff); + fillcache(&cache[maxoff], GETENTRY(entryvec, maxoff)); + /* sort before ... */ + costvector = (SPLITCOST *) palloc(sizeof(SPLITCOST) * maxoff); + for (j = FirstOffsetNumber; j <= maxoff; j = OffsetNumberNext(j)) + { + costvector[j - 1].pos = j; + size_alpha = hemdistcache(&(cache[seed_1]), &(cache[j])); + size_beta = hemdistcache(&(cache[seed_2]), &(cache[j])); + costvector[j - 1].cost = Abs(size_alpha - size_beta); + } + qsort((void *) costvector, maxoff, sizeof(SPLITCOST), comparecost); + + for (k = 0; k < maxoff; k++) + { + j = costvector[k].pos; + if (j == seed_1) + { + *left++ = j; + v->spl_nleft++; + continue; + } + else if (j == seed_2) + { + *right++ = j; + v->spl_nright++; + continue; + } + + if (ISALLTRUE(datum_l) || cache[j].allistrue) + { + if (ISALLTRUE(datum_l) && cache[j].allistrue) + size_alpha = 0; + else + size_alpha = SIGLENBIT - sizebitvec( + (cache[j].allistrue) ? GETSIGN(datum_l) : GETSIGN(cache[j].sign) + ); + } + else + size_alpha = hemdistsign(cache[j].sign, GETSIGN(datum_l)); + + if (ISALLTRUE(datum_r) || cache[j].allistrue) + { + if (ISALLTRUE(datum_r) && cache[j].allistrue) + size_beta = 0; + else + size_beta = SIGLENBIT - sizebitvec( + (cache[j].allistrue) ? GETSIGN(datum_r) : GETSIGN(cache[j].sign) + ); + } + else + size_beta = hemdistsign(cache[j].sign, GETSIGN(datum_r)); + + if (size_alpha < size_beta + WISH_F(v->spl_nleft, v->spl_nright, 0.1)) + { + if (ISALLTRUE(datum_l) || cache[j].allistrue) + { + if (!ISALLTRUE(datum_l)) + MemSet((void *) GETSIGN(datum_l), 0xff, sizeof(BITVEC)); + } + else + { + ptr = cache[j].sign; + LOOPBYTE( + union_l[i] |= ptr[i]; + ); + } + *left++ = j; + v->spl_nleft++; + } + else + { + if (ISALLTRUE(datum_r) || cache[j].allistrue) + { + if (!ISALLTRUE(datum_r)) + MemSet((void *) GETSIGN(datum_r), 0xff, sizeof(BITVEC)); + } + else + { + ptr = cache[j].sign; + LOOPBYTE( + union_r[i] |= ptr[i]; + ); + } + *right++ = j; + v->spl_nright++; + } + } + + *right = *left = FirstOffsetNumber; + v->spl_ldatum = PointerGetDatum(datum_l); + v->spl_rdatum = PointerGetDatum(datum_r); + + PG_RETURN_POINTER(v); +} diff --git a/src/backend/utils/adt/tsquery.c b/src/backend/utils/adt/tsquery.c new file mode 100644 index 00000000000..1f8abb3298e --- /dev/null +++ b/src/backend/utils/adt/tsquery.c @@ -0,0 +1,767 @@ +/*------------------------------------------------------------------------- + * + * tsquery.c + * I/O functions for tsquery + * + * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * $PostgreSQL: pgsql/src/backend/utils/adt/tsquery.c,v 1.1 2007/08/21 01:11:19 tgl Exp $ + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "libpq/pqformat.h" +#include "tsearch/ts_locale.h" +#include "tsearch/ts_type.h" +#include "tsearch/ts_utils.h" +#include "utils/memutils.h" +#include "utils/pg_crc.h" + +/* parser's states */ +#define WAITOPERAND 1 +#define WAITOPERATOR 2 +#define WAITFIRSTOPERAND 3 +#define WAITSINGLEOPERAND 4 + +/* + * node of query tree, also used + * for storing polish notation in parser + */ +typedef struct ParseQueryNode +{ + int2 weight; + int2 type; + int4 val; + int2 distance; + int2 length; + struct ParseQueryNode *next; +} ParseQueryNode; + +static char * +get_weight(char *buf, int2 *weight) +{ + *weight = 0; + + if (!t_iseq(buf, ':')) + return buf; + + buf++; + while (*buf && pg_mblen(buf) == 1) + { + switch (*buf) + { + case 'a': + case 'A': + *weight |= 1 << 3; + break; + case 'b': + case 'B': + *weight |= 1 << 2; + break; + case 'c': + case 'C': + *weight |= 1 << 1; + break; + case 'd': + case 'D': + *weight |= 1; + break; + default: + return buf; + } + buf++; + } + + return buf; +} + +/* + * get token from query string + */ +static int4 +gettoken_query(TSQueryParserState * state, int4 *val, int4 *lenval, char **strval, int2 *weight) +{ + while (1) + { + switch (state->state) + { + case WAITFIRSTOPERAND: + case WAITOPERAND: + if (t_iseq(state->buf, '!')) + { + (state->buf)++; /* can safely ++, t_iseq guarantee + * that pg_mblen()==1 */ + *val = (int4) '!'; + state->state = WAITOPERAND; + return OPR; + } + else if (t_iseq(state->buf, '(')) + { + state->count++; + (state->buf)++; + state->state = WAITOPERAND; + return OPEN; + } + else if (t_iseq(state->buf, ':')) + { + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("syntax error at start of operand in tsearch query: \"%s\"", + state->buffer))); + } + else if (!t_isspace(state->buf)) + { + state->valstate.prsbuf = state->buf; + if (gettoken_tsvector(&(state->valstate))) + { + *strval = state->valstate.word; + *lenval = state->valstate.curpos - state->valstate.word; + state->buf = get_weight(state->valstate.prsbuf, weight); + state->state = WAITOPERATOR; + return VAL; + } + else if (state->state == WAITFIRSTOPERAND) + return END; + else + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("no operand in tsearch query: \"%s\"", + state->buffer))); + } + break; + case WAITOPERATOR: + if (t_iseq(state->buf, '&') || t_iseq(state->buf, '|')) + { + state->state = WAITOPERAND; + *val = (int4) *(state->buf); + (state->buf)++; + return OPR; + } + else if (t_iseq(state->buf, ')')) + { + (state->buf)++; + state->count--; + return (state->count < 0) ? ERR : CLOSE; + } + else if (*(state->buf) == '\0') + return (state->count) ? ERR : END; + else if (!t_isspace(state->buf)) + return ERR; + break; + case WAITSINGLEOPERAND: + if (*(state->buf) == '\0') + return END; + *strval = state->buf; + *lenval = strlen(state->buf); + state->buf += strlen(state->buf); + state->count++; + return VAL; + default: + return ERR; + break; + } + state->buf += pg_mblen(state->buf); + } + return END; +} + +/* + * push new one in polish notation reverse view + */ +void +pushquery(TSQueryParserState * state, int4 type, int4 val, int4 distance, int4 lenval, int2 weight) +{ + ParseQueryNode *tmp = (ParseQueryNode *) palloc(sizeof(ParseQueryNode)); + + tmp->weight = weight; + tmp->type = type; + tmp->val = val; + if (distance >= MAXSTRPOS) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("value is too big in tsearch query: \"%s\"", + state->buffer))); + if (lenval >= MAXSTRLEN) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("operand is too long in tsearch query: \"%s\"", + state->buffer))); + tmp->distance = distance; + tmp->length = lenval; + tmp->next = state->str; + state->str = tmp; + state->num++; +} + +/* + * This function is used for tsquery parsing + */ +void +pushval_asis(TSQueryParserState * state, int type, char *strval, int lenval, int2 weight) +{ + pg_crc32 c; + + if (lenval >= MAXSTRLEN) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("word is too long in tsearch query: \"%s\"", + state->buffer))); + + INIT_CRC32(c); + COMP_CRC32(c, strval, lenval); + FIN_CRC32(c); + pushquery(state, type, *(int4 *) &c, + state->curop - state->op, lenval, weight); + + while (state->curop - state->op + lenval + 1 >= state->lenop) + { + int4 tmp = state->curop - state->op; + + state->lenop *= 2; + state->op = (char *) repalloc((void *) state->op, state->lenop); + state->curop = state->op + tmp; + } + memcpy((void *) state->curop, (void *) strval, lenval); + state->curop += lenval; + *(state->curop) = '\0'; + state->curop++; + state->sumlen += lenval + 1 /* \0 */ ; + return; +} + +#define STACKDEPTH 32 +/* + * make polish notation of query + */ +static int4 +makepol(TSQueryParserState * state, void (*pushval) (TSQueryParserState *, int, char *, int, int2)) +{ + int4 val = 0, + type; + int4 lenval = 0; + char *strval = NULL; + int4 stack[STACKDEPTH]; + int4 lenstack = 0; + int2 weight = 0; + + while ((type = gettoken_query(state, &val, &lenval, &strval, &weight)) != END) + { + switch (type) + { + case VAL: + pushval(state, VAL, strval, lenval, weight); + while (lenstack && (stack[lenstack - 1] == (int4) '&' || + stack[lenstack - 1] == (int4) '!')) + { + lenstack--; + pushquery(state, OPR, stack[lenstack], 0, 0, 0); + } + break; + case OPR: + if (lenstack && val == (int4) '|') + pushquery(state, OPR, val, 0, 0, 0); + else + { + if (lenstack == STACKDEPTH) /* internal error */ + elog(ERROR, "tsquery stack too small"); + stack[lenstack] = val; + lenstack++; + } + break; + case OPEN: + if (makepol(state, pushval) == ERR) + return ERR; + if (lenstack && (stack[lenstack - 1] == (int4) '&' || + stack[lenstack - 1] == (int4) '!')) + { + lenstack--; + pushquery(state, OPR, stack[lenstack], 0, 0, 0); + } + break; + case CLOSE: + while (lenstack) + { + lenstack--; + pushquery(state, OPR, stack[lenstack], 0, 0, 0); + }; + return END; + break; + case ERR: + default: + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("syntax error in tsearch query: \"%s\"", + state->buffer))); + return ERR; + + } + } + while (lenstack) + { + lenstack--; + pushquery(state, OPR, stack[lenstack], 0, 0, 0); + }; + return END; +} + +static void +findoprnd(QueryItem * ptr, int4 *pos) +{ + if (ptr[*pos].type == VAL || ptr[*pos].type == VALSTOP) + { + ptr[*pos].left = 0; + (*pos)++; + } + else if (ptr[*pos].val == (int4) '!') + { + ptr[*pos].left = 1; + (*pos)++; + findoprnd(ptr, pos); + } + else + { + QueryItem *curitem = &ptr[*pos]; + int4 tmp = *pos; + + (*pos)++; + findoprnd(ptr, pos); + curitem->left = *pos - tmp; + findoprnd(ptr, pos); + } +} + + +/* + * input + */ +TSQuery +parse_tsquery(char *buf, void (*pushval) (TSQueryParserState *, int, char *, int, int2), Oid cfg_id, bool isplain) +{ + TSQueryParserState state; + int4 i; + TSQuery query; + int4 commonlen; + QueryItem *ptr; + ParseQueryNode *tmp; + int4 pos = 0; + + /* init state */ + state.buffer = buf; + state.buf = buf; + state.state = (isplain) ? WAITSINGLEOPERAND : WAITFIRSTOPERAND; + state.count = 0; + state.num = 0; + state.str = NULL; + state.cfg_id = cfg_id; + + /* init value parser's state */ + state.valstate.oprisdelim = true; + state.valstate.len = 32; + state.valstate.word = (char *) palloc(state.valstate.len); + + /* init list of operand */ + state.sumlen = 0; + state.lenop = 64; + state.curop = state.op = (char *) palloc(state.lenop); + *(state.curop) = '\0'; + + /* parse query & make polish notation (postfix, but in reverse order) */ + makepol(&state, pushval); + pfree(state.valstate.word); + if (!state.num) + { + ereport(NOTICE, + (errmsg("tsearch query doesn't contain lexeme(s): \"%s\"", + state.buffer))); + query = (TSQuery) palloc(HDRSIZETQ); + SET_VARSIZE(query, HDRSIZETQ); + query->size = 0; + return query; + } + + /* make finish struct */ + commonlen = COMPUTESIZE(state.num, state.sumlen); + query = (TSQuery) palloc(commonlen); + SET_VARSIZE(query, commonlen); + query->size = state.num; + ptr = GETQUERY(query); + + /* set item in polish notation */ + for (i = 0; i < state.num; i++) + { + ptr[i].weight = state.str->weight; + ptr[i].type = state.str->type; + ptr[i].val = state.str->val; + ptr[i].distance = state.str->distance; + ptr[i].length = state.str->length; + tmp = state.str->next; + pfree(state.str); + state.str = tmp; + } + + /* set user friendly-operand view */ + memcpy((void *) GETOPERAND(query), (void *) state.op, state.sumlen); + pfree(state.op); + + /* set left operand's position for every operator */ + pos = 0; + findoprnd(ptr, &pos); + + return query; +} + +/* + * in without morphology + */ +Datum +tsqueryin(PG_FUNCTION_ARGS) +{ + char *in = PG_GETARG_CSTRING(0); + + pg_verifymbstr(in, strlen(in), false); + + PG_RETURN_TSQUERY(parse_tsquery(in, pushval_asis, InvalidOid, false)); +} + +/* + * out function + */ +typedef struct +{ + QueryItem *curpol; + char *buf; + char *cur; + char *op; + int4 buflen; +} INFIX; + +#define RESIZEBUF(inf,addsize) \ +while( ( (inf)->cur - (inf)->buf ) + (addsize) + 1 >= (inf)->buflen ) \ +{ \ + int4 len = (inf)->cur - (inf)->buf; \ + (inf)->buflen *= 2; \ + (inf)->buf = (char*) repalloc( (void*)(inf)->buf, (inf)->buflen ); \ + (inf)->cur = (inf)->buf + len; \ +} + +/* + * recursive walk on tree and print it in + * infix (human-readable) view + */ +static void +infix(INFIX * in, bool first) +{ + if (in->curpol->type == VAL) + { + char *op = in->op + in->curpol->distance; + int clen; + + RESIZEBUF(in, in->curpol->length * (pg_database_encoding_max_length() + 1) + 2 + 5); + *(in->cur) = '\''; + in->cur++; + while (*op) + { + if (t_iseq(op, '\'')) + { + *(in->cur) = '\''; + in->cur++; + } + COPYCHAR(in->cur, op); + + clen = pg_mblen(op); + op += clen; + in->cur += clen; + } + *(in->cur) = '\''; + in->cur++; + if (in->curpol->weight) + { + *(in->cur) = ':'; + in->cur++; + if (in->curpol->weight & (1 << 3)) + { + *(in->cur) = 'A'; + in->cur++; + } + if (in->curpol->weight & (1 << 2)) + { + *(in->cur) = 'B'; + in->cur++; + } + if (in->curpol->weight & (1 << 1)) + { + *(in->cur) = 'C'; + in->cur++; + } + if (in->curpol->weight & 1) + { + *(in->cur) = 'D'; + in->cur++; + } + } + *(in->cur) = '\0'; + in->curpol++; + } + else if (in->curpol->val == (int4) '!') + { + bool isopr = false; + + RESIZEBUF(in, 1); + *(in->cur) = '!'; + in->cur++; + *(in->cur) = '\0'; + in->curpol++; + if (in->curpol->type == OPR) + { + isopr = true; + RESIZEBUF(in, 2); + sprintf(in->cur, "( "); + in->cur = strchr(in->cur, '\0'); + } + infix(in, isopr); + if (isopr) + { + RESIZEBUF(in, 2); + sprintf(in->cur, " )"); + in->cur = strchr(in->cur, '\0'); + } + } + else + { + int4 op = in->curpol->val; + INFIX nrm; + + in->curpol++; + if (op == (int4) '|' && !first) + { + RESIZEBUF(in, 2); + sprintf(in->cur, "( "); + in->cur = strchr(in->cur, '\0'); + } + + nrm.curpol = in->curpol; + nrm.op = in->op; + nrm.buflen = 16; + nrm.cur = nrm.buf = (char *) palloc(sizeof(char) * nrm.buflen); + + /* get right operand */ + infix(&nrm, false); + + /* get & print left operand */ + in->curpol = nrm.curpol; + infix(in, false); + + /* print operator & right operand */ + RESIZEBUF(in, 3 + (nrm.cur - nrm.buf)); + sprintf(in->cur, " %c %s", op, nrm.buf); + in->cur = strchr(in->cur, '\0'); + pfree(nrm.buf); + + if (op == (int4) '|' && !first) + { + RESIZEBUF(in, 2); + sprintf(in->cur, " )"); + in->cur = strchr(in->cur, '\0'); + } + } +} + + +Datum +tsqueryout(PG_FUNCTION_ARGS) +{ + TSQuery query = PG_GETARG_TSQUERY(0); + INFIX nrm; + + if (query->size == 0) + { + char *b = palloc(1); + + *b = '\0'; + PG_RETURN_POINTER(b); + } + nrm.curpol = GETQUERY(query); + nrm.buflen = 32; + nrm.cur = nrm.buf = (char *) palloc(sizeof(char) * nrm.buflen); + *(nrm.cur) = '\0'; + nrm.op = GETOPERAND(query); + infix(&nrm, true); + + PG_FREE_IF_COPY(query, 0); + PG_RETURN_CSTRING(nrm.buf); +} + +Datum +tsquerysend(PG_FUNCTION_ARGS) +{ + TSQuery query = PG_GETARG_TSQUERY(0); + StringInfoData buf; + int i; + QueryItem *item = GETQUERY(query); + + pq_begintypsend(&buf); + + pq_sendint(&buf, query->size, sizeof(int32)); + for (i = 0; i < query->size; i++) + { + int tmp; + + pq_sendint(&buf, item->type, sizeof(item->type)); + pq_sendint(&buf, item->weight, sizeof(item->weight)); + pq_sendint(&buf, item->left, sizeof(item->left)); + pq_sendint(&buf, item->val, sizeof(item->val)); + + /* + * We are sure that sizeof(WordEntry) == sizeof(int32), and about + * layout of QueryItem + */ + tmp = *(int32 *) (((char *) item) + HDRSIZEQI); + pq_sendint(&buf, tmp, sizeof(tmp)); + + item++; + } + + item = GETQUERY(query); + for (i = 0; i < query->size; i++) + { + if (item->type == VAL) + pq_sendbytes(&buf, GETOPERAND(query) + item->distance, item->length); + item++; + } + + PG_FREE_IF_COPY(query, 0); + + PG_RETURN_BYTEA_P(pq_endtypsend(&buf)); +} + +Datum +tsqueryrecv(PG_FUNCTION_ARGS) +{ + StringInfo buf = (StringInfo) PG_GETARG_POINTER(0); + TSQuery query; + int i, + size, + tmp, + len = HDRSIZETQ; + QueryItem *item; + int datalen = 0; + char *ptr; + + size = pq_getmsgint(buf, sizeof(uint32)); + if (size < 0 || size > (MaxAllocSize / sizeof(QueryItem))) + elog(ERROR, "invalid size of tsquery"); + len += sizeof(QueryItem) * size; + + query = (TSQuery) palloc(len); + query->size = size; + item = GETQUERY(query); + + for (i = 0; i < size; i++) + { + item->type = (int8) pq_getmsgint(buf, sizeof(int8)); + item->weight = (int8) pq_getmsgint(buf, sizeof(int8)); + item->left = (int16) pq_getmsgint(buf, sizeof(int16)); + item->val = (int32) pq_getmsgint(buf, sizeof(int32)); + tmp = pq_getmsgint(buf, sizeof(int32)); + memcpy((((char *) item) + HDRSIZEQI), &tmp, sizeof(int32)); + + /* + * Sanity checks + */ + if (item->type == VAL) + { + datalen += item->length + 1; /* \0 */ + } + else if (item->type == OPR) + { + if (item->val == '|' || item->val == '&') + { + if (item->left <= 0 || i + item->left >= size) + elog(ERROR, "invalid pointer to left operand"); + } + + if (i == size - 1) + elog(ERROR, "invalid pointer to right operand"); + } + else + elog(ERROR, "unknown tsquery node type"); + + item++; + } + + query = (TSQuery) repalloc(query, len + datalen); + + item = GETQUERY(query); + ptr = GETOPERAND(query); + for (i = 0; i < size; i++) + { + if (item->type == VAL) + { + item->distance = ptr - GETOPERAND(query); + memcpy(ptr, + pq_getmsgbytes(buf, item->length), + item->length); + ptr += item->length; + *ptr++ = '\0'; + } + item++; + } + + Assert(ptr - GETOPERAND(query) == datalen); + + SET_VARSIZE(query, len + datalen); + + PG_RETURN_TSVECTOR(query); +} + +/* + * debug function, used only for view query + * which will be executed in non-leaf pages in index + */ +Datum +tsquerytree(PG_FUNCTION_ARGS) +{ + TSQuery query = PG_GETARG_TSQUERY(0); + INFIX nrm; + text *res; + QueryItem *q; + int4 len; + + if (query->size == 0) + { + res = (text *) palloc(VARHDRSZ); + SET_VARSIZE(res, VARHDRSZ); + PG_RETURN_POINTER(res); + } + + q = clean_NOT(GETQUERY(query), &len); + + if (!q) + { + res = (text *) palloc(1 + VARHDRSZ); + SET_VARSIZE(res, 1 + VARHDRSZ); + *((char *) VARDATA(res)) = 'T'; + } + else + { + nrm.curpol = q; + nrm.buflen = 32; + nrm.cur = nrm.buf = (char *) palloc(sizeof(char) * nrm.buflen); + *(nrm.cur) = '\0'; + nrm.op = GETOPERAND(query); + infix(&nrm, true); + + res = (text *) palloc(nrm.cur - nrm.buf + VARHDRSZ); + SET_VARSIZE(res, nrm.cur - nrm.buf + VARHDRSZ); + strncpy(VARDATA(res), nrm.buf, nrm.cur - nrm.buf); + pfree(q); + } + + PG_FREE_IF_COPY(query, 0); + + PG_RETURN_POINTER(res); +} diff --git a/src/backend/utils/adt/tsquery_cleanup.c b/src/backend/utils/adt/tsquery_cleanup.c new file mode 100644 index 00000000000..7991a4ad198 --- /dev/null +++ b/src/backend/utils/adt/tsquery_cleanup.c @@ -0,0 +1,261 @@ +/*------------------------------------------------------------------------- + * + * tsquery_cleanup.c + * Cleanup query from NOT values and/or stopword + * Utility functions to correct work. + * + * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * $PostgreSQL: pgsql/src/backend/utils/adt/tsquery_cleanup.c,v 1.1 2007/08/21 01:11:19 tgl Exp $ + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "tsearch/ts_type.h" +#include "tsearch/ts_utils.h" + +typedef struct NODE +{ + struct NODE *left; + struct NODE *right; + QueryItem *valnode; +} NODE; + +/* + * make query tree from plain view of query + */ +static NODE * +maketree(QueryItem * in) +{ + NODE *node = (NODE *) palloc(sizeof(NODE)); + + node->valnode = in; + node->right = node->left = NULL; + if (in->type == OPR) + { + node->right = maketree(in + 1); + if (in->val != (int4) '!') + node->left = maketree(in + in->left); + } + return node; +} + +typedef struct +{ + QueryItem *ptr; + int4 len; + int4 cur; +} PLAINTREE; + +static void +plainnode(PLAINTREE * state, NODE * node) +{ + if (state->cur == state->len) + { + state->len *= 2; + state->ptr = (QueryItem *) repalloc((void *) state->ptr, state->len * sizeof(QueryItem)); + } + memcpy((void *) &(state->ptr[state->cur]), (void *) node->valnode, sizeof(QueryItem)); + if (node->valnode->type == VAL) + state->cur++; + else if (node->valnode->val == (int4) '!') + { + state->ptr[state->cur].left = 1; + state->cur++; + plainnode(state, node->right); + } + else + { + int4 cur = state->cur; + + state->cur++; + plainnode(state, node->right); + state->ptr[cur].left = state->cur - cur; + plainnode(state, node->left); + } + pfree(node); +} + +/* + * make plain view of tree from 'normal' view of tree + */ +static QueryItem * +plaintree(NODE * root, int4 *len) +{ + PLAINTREE pl; + + pl.cur = 0; + pl.len = 16; + if (root && (root->valnode->type == VAL || root->valnode->type == OPR)) + { + pl.ptr = (QueryItem *) palloc(pl.len * sizeof(QueryItem)); + plainnode(&pl, root); + } + else + pl.ptr = NULL; + *len = pl.cur; + return pl.ptr; +} + +static void +freetree(NODE * node) +{ + if (!node) + return; + if (node->left) + freetree(node->left); + if (node->right) + freetree(node->right); + pfree(node); +} + +/* + * clean tree for ! operator. + * It's usefull for debug, but in + * other case, such view is used with search in index. + * Operator ! always return TRUE + */ +static NODE * +clean_NOT_intree(NODE * node) +{ + if (node->valnode->type == VAL) + return node; + + if (node->valnode->val == (int4) '!') + { + freetree(node); + return NULL; + } + + /* operator & or | */ + if (node->valnode->val == (int4) '|') + { + if ((node->left = clean_NOT_intree(node->left)) == NULL || + (node->right = clean_NOT_intree(node->right)) == NULL) + { + freetree(node); + return NULL; + } + } + else + { + NODE *res = node; + + node->left = clean_NOT_intree(node->left); + node->right = clean_NOT_intree(node->right); + if (node->left == NULL && node->right == NULL) + { + pfree(node); + res = NULL; + } + else if (node->left == NULL) + { + res = node->right; + pfree(node); + } + else if (node->right == NULL) + { + res = node->left; + pfree(node); + } + return res; + } + return node; +} + +QueryItem * +clean_NOT(QueryItem * ptr, int4 *len) +{ + NODE *root = maketree(ptr); + + return plaintree(clean_NOT_intree(root), len); +} + + +#ifdef V_UNKNOWN /* exists in Windows headers */ +#undef V_UNKNOWN +#endif + +#define V_UNKNOWN 0 +#define V_TRUE 1 +#define V_FALSE 2 +#define V_STOP 3 + +/* + * Clean query tree from values which is always in + * text (stopword) + */ +static NODE * +clean_fakeval_intree(NODE * node, char *result) +{ + char lresult = V_UNKNOWN, + rresult = V_UNKNOWN; + + if (node->valnode->type == VAL) + return node; + else if (node->valnode->type == VALSTOP) + { + pfree(node); + *result = V_STOP; + return NULL; + } + + + if (node->valnode->val == (int4) '!') + { + node->right = clean_fakeval_intree(node->right, &rresult); + if (!node->right) + { + *result = V_STOP; + freetree(node); + return NULL; + } + } + else + { + NODE *res = node; + + node->left = clean_fakeval_intree(node->left, &lresult); + node->right = clean_fakeval_intree(node->right, &rresult); + if (lresult == V_STOP && rresult == V_STOP) + { + freetree(node); + *result = V_STOP; + return NULL; + } + else if (lresult == V_STOP) + { + res = node->right; + pfree(node); + } + else if (rresult == V_STOP) + { + res = node->left; + pfree(node); + } + return res; + } + return node; +} + +QueryItem * +clean_fakeval(QueryItem * ptr, int4 *len) +{ + NODE *root = maketree(ptr); + char result = V_UNKNOWN; + NODE *resroot; + + resroot = clean_fakeval_intree(root, &result); + if (result != V_UNKNOWN) + { + elog(NOTICE, "query contains only stopword(s) or doesn't contain lexeme(s), ignored"); + *len = 0; + return NULL; + } + + return plaintree(resroot, len); +} diff --git a/src/backend/utils/adt/tsquery_gist.c b/src/backend/utils/adt/tsquery_gist.c new file mode 100644 index 00000000000..0deca10075c --- /dev/null +++ b/src/backend/utils/adt/tsquery_gist.c @@ -0,0 +1,259 @@ +/*------------------------------------------------------------------------- + * + * tsquery_gist.c + * GiST index support for tsquery + * + * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * $PostgreSQL: pgsql/src/backend/utils/adt/tsquery_gist.c,v 1.1 2007/08/21 01:11:19 tgl Exp $ + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/skey.h" +#include "access/gist.h" +#include "tsearch/ts_type.h" +#include "tsearch/ts_utils.h" + +#define GETENTRY(vec,pos) ((TSQuerySign *) DatumGetPointer((vec)->vector[(pos)].key)) + +Datum +gtsquery_compress(PG_FUNCTION_ARGS) +{ + GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0); + GISTENTRY *retval = entry; + + if (entry->leafkey) + { + TSQuerySign *sign = (TSQuerySign *) palloc(sizeof(TSQuerySign)); + + retval = (GISTENTRY *) palloc(sizeof(GISTENTRY)); + *sign = makeTSQuerySign(DatumGetTSQuery(entry->key)); + + gistentryinit(*retval, PointerGetDatum(sign), + entry->rel, entry->page, + entry->offset, FALSE); + } + + PG_RETURN_POINTER(retval); +} + +Datum +gtsquery_decompress(PG_FUNCTION_ARGS) +{ + PG_RETURN_DATUM(PG_GETARG_DATUM(0)); +} + +Datum +gtsquery_consistent(PG_FUNCTION_ARGS) +{ + GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0); + TSQuerySign *key = (TSQuerySign *) DatumGetPointer(entry->key); + TSQuery query = PG_GETARG_TSQUERY(1); + StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2); + TSQuerySign sq = makeTSQuerySign(query); + bool retval; + + switch (strategy) + { + case RTContainsStrategyNumber: + if (GIST_LEAF(entry)) + retval = (*key & sq) == sq; + else + retval = (*key & sq) != 0; + break; + case RTContainedByStrategyNumber: + if (GIST_LEAF(entry)) + retval = (*key & sq) == *key; + else + retval = (*key & sq) != 0; + break; + default: + retval = FALSE; + } + PG_RETURN_BOOL(retval); +} + +Datum +gtsquery_union(PG_FUNCTION_ARGS) +{ + GistEntryVector *entryvec = (GistEntryVector *) PG_GETARG_POINTER(0); + int *size = (int *) PG_GETARG_POINTER(1); + TSQuerySign *sign = (TSQuerySign *) palloc(sizeof(TSQuerySign)); + int i; + + memset(sign, 0, sizeof(TSQuerySign)); + + for (i = 0; i < entryvec->n; i++) + *sign |= *GETENTRY(entryvec, i); + + *size = sizeof(TSQuerySign); + + PG_RETURN_POINTER(sign); +} + +Datum +gtsquery_same(PG_FUNCTION_ARGS) +{ + TSQuerySign *a = (TSQuerySign *) PG_GETARG_POINTER(0); + TSQuerySign *b = (TSQuerySign *) PG_GETARG_POINTER(1); + + PG_RETURN_POINTER(*a == *b); +} + +static int +sizebitvec(TSQuerySign sign) +{ + int size = 0, + i; + + for (i = 0; i < TSQS_SIGLEN; i++) + size += 0x01 & (sign >> i); + + return size; +} + +static int +hemdist(TSQuerySign a, TSQuerySign b) +{ + TSQuerySign res = a ^ b; + + return sizebitvec(res); +} + +Datum +gtsquery_penalty(PG_FUNCTION_ARGS) +{ + TSQuerySign *origval = (TSQuerySign *) DatumGetPointer(((GISTENTRY *) PG_GETARG_POINTER(0))->key); + TSQuerySign *newval = (TSQuerySign *) DatumGetPointer(((GISTENTRY *) PG_GETARG_POINTER(1))->key); + float *penalty = (float *) PG_GETARG_POINTER(2); + + *penalty = hemdist(*origval, *newval); + + PG_RETURN_POINTER(penalty); +} + + +typedef struct +{ + OffsetNumber pos; + int4 cost; +} SPLITCOST; + +static int +comparecost(const void *a, const void *b) +{ + if (((SPLITCOST *) a)->cost == ((SPLITCOST *) b)->cost) + return 0; + else + return (((SPLITCOST *) a)->cost > ((SPLITCOST *) b)->cost) ? 1 : -1; +} + +#define WISH_F(a,b,c) (double)( -(double)(((a)-(b))*((a)-(b))*((a)-(b)))*(c) ) + +Datum +gtsquery_picksplit(PG_FUNCTION_ARGS) +{ + GistEntryVector *entryvec = (GistEntryVector *) PG_GETARG_POINTER(0); + GIST_SPLITVEC *v = (GIST_SPLITVEC *) PG_GETARG_POINTER(1); + OffsetNumber maxoff = entryvec->n - 2; + OffsetNumber k, + j; + + TSQuerySign *datum_l, + *datum_r; + int4 size_alpha, + size_beta; + int4 size_waste, + waste = -1; + int4 nbytes; + OffsetNumber seed_1 = 0, + seed_2 = 0; + OffsetNumber *left, + *right; + + SPLITCOST *costvector; + + nbytes = (maxoff + 2) * sizeof(OffsetNumber); + left = v->spl_left = (OffsetNumber *) palloc(nbytes); + right = v->spl_right = (OffsetNumber *) palloc(nbytes); + v->spl_nleft = v->spl_nright = 0; + + for (k = FirstOffsetNumber; k < maxoff; k = OffsetNumberNext(k)) + for (j = OffsetNumberNext(k); j <= maxoff; j = OffsetNumberNext(j)) + { + size_waste = hemdist(*GETENTRY(entryvec, j), *GETENTRY(entryvec, k)); + if (size_waste > waste) + { + waste = size_waste; + seed_1 = k; + seed_2 = j; + } + } + + + if (seed_1 == 0 || seed_2 == 0) + { + seed_1 = 1; + seed_2 = 2; + } + + datum_l = (TSQuerySign *) palloc(sizeof(TSQuerySign)); + *datum_l = *GETENTRY(entryvec, seed_1); + datum_r = (TSQuerySign *) palloc(sizeof(TSQuerySign)); + *datum_r = *GETENTRY(entryvec, seed_2); + + + maxoff = OffsetNumberNext(maxoff); + costvector = (SPLITCOST *) palloc(sizeof(SPLITCOST) * maxoff); + for (j = FirstOffsetNumber; j <= maxoff; j = OffsetNumberNext(j)) + { + costvector[j - 1].pos = j; + size_alpha = hemdist(*GETENTRY(entryvec, seed_1), *GETENTRY(entryvec, j)); + size_beta = hemdist(*GETENTRY(entryvec, seed_2), *GETENTRY(entryvec, j)); + costvector[j - 1].cost = abs(size_alpha - size_beta); + } + qsort((void *) costvector, maxoff, sizeof(SPLITCOST), comparecost); + + for (k = 0; k < maxoff; k++) + { + j = costvector[k].pos; + if (j == seed_1) + { + *left++ = j; + v->spl_nleft++; + continue; + } + else if (j == seed_2) + { + *right++ = j; + v->spl_nright++; + continue; + } + size_alpha = hemdist(*datum_l, *GETENTRY(entryvec, j)); + size_beta = hemdist(*datum_r, *GETENTRY(entryvec, j)); + + if (size_alpha < size_beta + WISH_F(v->spl_nleft, v->spl_nright, 0.05)) + { + *datum_l |= *GETENTRY(entryvec, j); + *left++ = j; + v->spl_nleft++; + } + else + { + *datum_r |= *GETENTRY(entryvec, j); + *right++ = j; + v->spl_nright++; + } + } + + *right = *left = FirstOffsetNumber; + v->spl_ldatum = PointerGetDatum(datum_l); + v->spl_rdatum = PointerGetDatum(datum_r); + + PG_RETURN_POINTER(v); +} diff --git a/src/backend/utils/adt/tsquery_op.c b/src/backend/utils/adt/tsquery_op.c new file mode 100644 index 00000000000..fd97c2796df --- /dev/null +++ b/src/backend/utils/adt/tsquery_op.c @@ -0,0 +1,289 @@ +/*------------------------------------------------------------------------- + * + * tsquery_op.c + * Various operations with tsquery + * + * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * $PostgreSQL: pgsql/src/backend/utils/adt/tsquery_op.c,v 1.1 2007/08/21 01:11:19 tgl Exp $ + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "tsearch/ts_type.h" +#include "tsearch/ts_locale.h" +#include "tsearch/ts_utils.h" +#include "utils/pg_crc.h" + +Datum +tsquery_numnode(PG_FUNCTION_ARGS) +{ + TSQuery query = PG_GETARG_TSQUERY(0); + int nnode = query->size; + + PG_FREE_IF_COPY(query, 0); + PG_RETURN_INT32(nnode); +} + +static QTNode * +join_tsqueries(TSQuery a, TSQuery b) +{ + QTNode *res = (QTNode *) palloc0(sizeof(QTNode)); + + res->flags |= QTN_NEEDFREE; + + res->valnode = (QueryItem *) palloc0(sizeof(QueryItem)); + res->valnode->type = OPR; + + res->child = (QTNode **) palloc0(sizeof(QTNode *) * 2); + res->child[0] = QT2QTN(GETQUERY(b), GETOPERAND(b)); + res->child[1] = QT2QTN(GETQUERY(a), GETOPERAND(a)); + res->nchild = 2; + + return res; +} + +Datum +tsquery_and(PG_FUNCTION_ARGS) +{ + TSQuery a = PG_GETARG_TSQUERY_COPY(0); + TSQuery b = PG_GETARG_TSQUERY_COPY(1); + QTNode *res; + TSQuery query; + + if (a->size == 0) + { + PG_FREE_IF_COPY(a, 1); + PG_RETURN_POINTER(b); + } + else if (b->size == 0) + { + PG_FREE_IF_COPY(b, 1); + PG_RETURN_POINTER(a); + } + + res = join_tsqueries(a, b); + + res->valnode->val = '&'; + + query = QTN2QT(res); + + QTNFree(res); + PG_FREE_IF_COPY(a, 0); + PG_FREE_IF_COPY(b, 1); + + PG_RETURN_TSQUERY(query); +} + +Datum +tsquery_or(PG_FUNCTION_ARGS) +{ + TSQuery a = PG_GETARG_TSQUERY_COPY(0); + TSQuery b = PG_GETARG_TSQUERY_COPY(1); + QTNode *res; + TSQuery query; + + if (a->size == 0) + { + PG_FREE_IF_COPY(a, 1); + PG_RETURN_POINTER(b); + } + else if (b->size == 0) + { + PG_FREE_IF_COPY(b, 1); + PG_RETURN_POINTER(a); + } + + res = join_tsqueries(a, b); + + res->valnode->val = '|'; + + query = QTN2QT(res); + + QTNFree(res); + PG_FREE_IF_COPY(a, 0); + PG_FREE_IF_COPY(b, 1); + + PG_RETURN_POINTER(query); +} + +Datum +tsquery_not(PG_FUNCTION_ARGS) +{ + TSQuery a = PG_GETARG_TSQUERY_COPY(0); + QTNode *res; + TSQuery query; + + if (a->size == 0) + PG_RETURN_POINTER(a); + + res = (QTNode *) palloc0(sizeof(QTNode)); + + res->flags |= QTN_NEEDFREE; + + res->valnode = (QueryItem *) palloc0(sizeof(QueryItem)); + res->valnode->type = OPR; + res->valnode->val = '!'; + + res->child = (QTNode **) palloc0(sizeof(QTNode *)); + res->child[0] = QT2QTN(GETQUERY(a), GETOPERAND(a)); + res->nchild = 1; + + query = QTN2QT(res); + + QTNFree(res); + PG_FREE_IF_COPY(a, 0); + + PG_RETURN_POINTER(query); +} + +static int +CompareTSQ(TSQuery a, TSQuery b) +{ + if (a->size != b->size) + { + return (a->size < b->size) ? -1 : 1; + } + else if (VARSIZE(a) != VARSIZE(b)) + { + return (VARSIZE(a) < VARSIZE(b)) ? -1 : 1; + } + else + { + QTNode *an = QT2QTN(GETQUERY(a), GETOPERAND(a)); + QTNode *bn = QT2QTN(GETQUERY(b), GETOPERAND(b)); + int res = QTNodeCompare(an, bn); + + QTNFree(an); + QTNFree(bn); + + return res; + } + + return 0; +} + +Datum +tsquery_cmp(PG_FUNCTION_ARGS) +{ + TSQuery a = PG_GETARG_TSQUERY_COPY(0); + TSQuery b = PG_GETARG_TSQUERY_COPY(1); + int res = CompareTSQ(a, b); + + PG_FREE_IF_COPY(a, 0); + PG_FREE_IF_COPY(b, 1); + + PG_RETURN_INT32(res); +} + +#define CMPFUNC( NAME, CONDITION ) \ +Datum \ +NAME(PG_FUNCTION_ARGS) { \ + TSQuery a = PG_GETARG_TSQUERY_COPY(0); \ + TSQuery b = PG_GETARG_TSQUERY_COPY(1); \ + int res = CompareTSQ(a,b); \ + \ + PG_FREE_IF_COPY(a,0); \ + PG_FREE_IF_COPY(b,1); \ + \ + PG_RETURN_BOOL( CONDITION ); \ +} + +CMPFUNC(tsquery_lt, res < 0); +CMPFUNC(tsquery_le, res <= 0); +CMPFUNC(tsquery_eq, res == 0); +CMPFUNC(tsquery_ge, res >= 0); +CMPFUNC(tsquery_gt, res > 0); +CMPFUNC(tsquery_ne, res != 0); + +TSQuerySign +makeTSQuerySign(TSQuery a) +{ + int i; + QueryItem *ptr = GETQUERY(a); + TSQuerySign sign = 0; + + for (i = 0; i < a->size; i++) + { + if (ptr->type == VAL) + sign |= ((TSQuerySign) 1) << (ptr->val % TSQS_SIGLEN); + ptr++; + } + + return sign; +} + +Datum +tsq_mcontains(PG_FUNCTION_ARGS) +{ + TSQuery query = PG_GETARG_TSQUERY(0); + TSQuery ex = PG_GETARG_TSQUERY(1); + TSQuerySign sq, + se; + int i, + j; + QueryItem *iq, + *ie; + + if (query->size < ex->size) + { + PG_FREE_IF_COPY(query, 0); + PG_FREE_IF_COPY(ex, 1); + + PG_RETURN_BOOL(false); + } + + sq = makeTSQuerySign(query); + se = makeTSQuerySign(ex); + + if ((sq & se) != se) + { + PG_FREE_IF_COPY(query, 0); + PG_FREE_IF_COPY(ex, 1); + + PG_RETURN_BOOL(false); + } + + ie = GETQUERY(ex); + + for (i = 0; i < ex->size; i++) + { + iq = GETQUERY(query); + if (ie[i].type != VAL) + continue; + for (j = 0; j < query->size; j++) + if (iq[j].type == VAL && ie[i].val == iq[j].val) + { + j = query->size + 1; + break; + } + if (j == query->size) + { + PG_FREE_IF_COPY(query, 0); + PG_FREE_IF_COPY(ex, 1); + + PG_RETURN_BOOL(false); + } + } + + PG_FREE_IF_COPY(query, 0); + PG_FREE_IF_COPY(ex, 1); + + PG_RETURN_BOOL(true); +} + +Datum +tsq_mcontained(PG_FUNCTION_ARGS) +{ + PG_RETURN_DATUM( + DirectFunctionCall2( + tsq_mcontains, + PG_GETARG_DATUM(1), + PG_GETARG_DATUM(0) + ) + ); +} diff --git a/src/backend/utils/adt/tsquery_rewrite.c b/src/backend/utils/adt/tsquery_rewrite.c new file mode 100644 index 00000000000..f0d22c644ae --- /dev/null +++ b/src/backend/utils/adt/tsquery_rewrite.c @@ -0,0 +1,524 @@ +/*------------------------------------------------------------------------- + * + * tsquery_rewrite.c + * Utilities for reconstructing tsquery + * + * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * $PostgreSQL: pgsql/src/backend/utils/adt/tsquery_rewrite.c,v 1.1 2007/08/21 01:11:19 tgl Exp $ + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "executor/spi.h" +#include "tsearch/ts_type.h" +#include "tsearch/ts_utils.h" + + +static int +addone(int *counters, int last, int total) +{ + counters[last]++; + if (counters[last] >= total) + { + if (last == 0) + return 0; + if (addone(counters, last - 1, total - 1) == 0) + return 0; + counters[last] = counters[last - 1] + 1; + } + return 1; +} + +static QTNode * +findeq(QTNode *node, QTNode *ex, QTNode *subs, bool *isfind) +{ + + if ((node->sign & ex->sign) != ex->sign || node->valnode->type != ex->valnode->type || node->valnode->val != ex->valnode->val) + return node; + + if (node->flags & QTN_NOCHANGE) + return node; + + if (node->valnode->type == OPR) + { + if (node->nchild == ex->nchild) + { + if (QTNEq(node, ex)) + { + QTNFree(node); + if (subs) + { + node = QTNCopy(subs); + node->flags |= QTN_NOCHANGE; + } + else + node = NULL; + *isfind = true; + } + } + else if (node->nchild > ex->nchild) + { + int *counters = (int *) palloc(sizeof(int) * node->nchild); + int i; + QTNode *tnode = (QTNode *) palloc(sizeof(QTNode)); + + memset(tnode, 0, sizeof(QTNode)); + tnode->child = (QTNode **) palloc(sizeof(QTNode *) * ex->nchild); + tnode->nchild = ex->nchild; + tnode->valnode = (QueryItem *) palloc(sizeof(QueryItem)); + *(tnode->valnode) = *(ex->valnode); + + for (i = 0; i < ex->nchild; i++) + counters[i] = i; + + do + { + tnode->sign = 0; + for (i = 0; i < ex->nchild; i++) + { + tnode->child[i] = node->child[counters[i]]; + tnode->sign |= tnode->child[i]->sign; + } + + if (QTNEq(tnode, ex)) + { + int j = 0; + + pfree(tnode->valnode); + pfree(tnode->child); + pfree(tnode); + if (subs) + { + tnode = QTNCopy(subs); + tnode->flags = QTN_NOCHANGE | QTN_NEEDFREE; + } + else + tnode = NULL; + + node->child[counters[0]] = tnode; + + for (i = 1; i < ex->nchild; i++) + node->child[counters[i]] = NULL; + for (i = 0; i < node->nchild; i++) + { + if (node->child[i]) + { + node->child[j] = node->child[i]; + j++; + } + } + + node->nchild = j; + + *isfind = true; + + break; + } + } while (addone(counters, ex->nchild - 1, node->nchild)); + if (tnode && (tnode->flags & QTN_NOCHANGE) == 0) + { + pfree(tnode->valnode); + pfree(tnode->child); + pfree(tnode); + } + else + QTNSort(node); + pfree(counters); + } + } + else if (QTNEq(node, ex)) + { + QTNFree(node); + if (subs) + { + node = QTNCopy(subs); + node->flags |= QTN_NOCHANGE; + } + else + { + node = NULL; + } + *isfind = true; + } + + return node; +} + +static QTNode * +dofindsubquery(QTNode *root, QTNode *ex, QTNode *subs, bool *isfind) +{ + root = findeq(root, ex, subs, isfind); + + if (root && (root->flags & QTN_NOCHANGE) == 0 && root->valnode->type == OPR) + { + int i; + + for (i = 0; i < root->nchild; i++) + root->child[i] = dofindsubquery(root->child[i], ex, subs, isfind); + } + + return root; +} + +static QTNode * +dropvoidsubtree(QTNode * root) +{ + + if (!root) + return NULL; + + if (root->valnode->type == OPR) + { + int i, + j = 0; + + for (i = 0; i < root->nchild; i++) + { + if (root->child[i]) + { + root->child[j] = root->child[i]; + j++; + } + } + + root->nchild = j; + + if (root->valnode->val == (int4) '!' && root->nchild == 0) + { + QTNFree(root); + root = NULL; + } + else if (root->nchild == 1) + { + QTNode *nroot = root->child[0]; + + pfree(root); + root = nroot; + } + } + + return root; +} + +static QTNode * +findsubquery(QTNode *root, QTNode *ex, QTNode *subs, bool *isfind) +{ + bool DidFind = false; + + root = dofindsubquery(root, ex, subs, &DidFind); + + if (!subs && DidFind) + root = dropvoidsubtree(root); + + if (isfind) + *isfind = DidFind; + + return root; +} + +Datum +ts_rewrite_accum(PG_FUNCTION_ARGS) +{ + TSQuery acc; + ArrayType *qa; + TSQuery q; + QTNode *qex = NULL, + *subs = NULL, + *acctree = NULL; + bool isfind = false; + Datum *elemsp; + int nelemsp; + MemoryContext aggcontext; + MemoryContext oldcontext; + + aggcontext = ((AggState *) fcinfo->context)->aggcontext; + + if (PG_ARGISNULL(0) || PG_GETARG_POINTER(0) == NULL) + { + acc = (TSQuery) MemoryContextAlloc(aggcontext, HDRSIZETQ); + SET_VARSIZE(acc, HDRSIZETQ); + acc->size = 0; + } + else + acc = PG_GETARG_TSQUERY(0); + + if (PG_ARGISNULL(1) || PG_GETARG_POINTER(1) == NULL) + PG_RETURN_TSQUERY(acc); + else + qa = PG_GETARG_ARRAYTYPE_P_COPY(1); + + if (ARR_NDIM(qa) != 1) + elog(ERROR, "array must be one-dimensional, not %d dimensions", + ARR_NDIM(qa)); + if (ArrayGetNItems(ARR_NDIM(qa), ARR_DIMS(qa)) != 3) + elog(ERROR, "array should have only three elements"); + if (ARR_ELEMTYPE(qa) != TSQUERYOID) + elog(ERROR, "array should contain tsquery type"); + + deconstruct_array(qa, TSQUERYOID, -1, false, 'i', &elemsp, NULL, &nelemsp); + + q = DatumGetTSQuery(elemsp[0]); + if (q->size == 0) + { + pfree(elemsp); + PG_RETURN_POINTER(acc); + } + + if (!acc->size) + { + if (VARSIZE(acc) > HDRSIZETQ) + { + pfree(elemsp); + PG_RETURN_POINTER(acc); + } + else + acctree = QT2QTN(GETQUERY(q), GETOPERAND(q)); + } + else + acctree = QT2QTN(GETQUERY(acc), GETOPERAND(acc)); + + QTNTernary(acctree); + QTNSort(acctree); + + q = DatumGetTSQuery(elemsp[1]); + if (q->size == 0) + { + pfree(elemsp); + PG_RETURN_POINTER(acc); + } + qex = QT2QTN(GETQUERY(q), GETOPERAND(q)); + QTNTernary(qex); + QTNSort(qex); + + q = DatumGetTSQuery(elemsp[2]); + if (q->size) + subs = QT2QTN(GETQUERY(q), GETOPERAND(q)); + + acctree = findsubquery(acctree, qex, subs, &isfind); + + if (isfind || !acc->size) + { + /* pfree( acc ); do not pfree(p), because nodeAgg.c will */ + if (acctree) + { + QTNBinary(acctree); + oldcontext = MemoryContextSwitchTo(aggcontext); + acc = QTN2QT(acctree); + MemoryContextSwitchTo(oldcontext); + } + else + { + acc = (TSQuery) MemoryContextAlloc(aggcontext, HDRSIZETQ); + SET_VARSIZE(acc, HDRSIZETQ); + acc->size = 0; + } + } + + pfree(elemsp); + QTNFree(qex); + QTNFree(subs); + QTNFree(acctree); + + PG_RETURN_TSQUERY(acc); +} + +Datum +ts_rewrite_finish(PG_FUNCTION_ARGS) +{ + TSQuery acc = PG_GETARG_TSQUERY(0); + TSQuery rewrited; + + if (acc == NULL || PG_ARGISNULL(0) || acc->size == 0) + { + rewrited = (TSQuery) palloc(HDRSIZETQ); + SET_VARSIZE(rewrited, HDRSIZETQ); + rewrited->size = 0; + } + else + { + rewrited = (TSQuery) palloc(VARSIZE(acc)); + memcpy(rewrited, acc, VARSIZE(acc)); + pfree(acc); + } + + PG_RETURN_POINTER(rewrited); +} + +Datum +tsquery_rewrite(PG_FUNCTION_ARGS) +{ + TSQuery query = PG_GETARG_TSQUERY_COPY(0); + text *in = PG_GETARG_TEXT_P(1); + TSQuery rewrited = query; + MemoryContext outercontext = CurrentMemoryContext; + MemoryContext oldcontext; + QTNode *tree; + char *buf; + void *plan; + Portal portal; + bool isnull; + int i; + + if (query->size == 0) + { + PG_FREE_IF_COPY(in, 1); + PG_RETURN_POINTER(rewrited); + } + + tree = QT2QTN(GETQUERY(query), GETOPERAND(query)); + QTNTernary(tree); + QTNSort(tree); + + buf = TextPGetCString(in); + + SPI_connect(); + + if ((plan = SPI_prepare(buf, 0, NULL)) == NULL) + elog(ERROR, "SPI_prepare(\"%s\") failed", buf); + + if ((portal = SPI_cursor_open(NULL, plan, NULL, NULL, false)) == NULL) + elog(ERROR, "SPI_cursor_open(\"%s\") failed", buf); + + SPI_cursor_fetch(portal, true, 100); + + if (SPI_tuptable->tupdesc->natts != 2 || + SPI_gettypeid(SPI_tuptable->tupdesc, 1) != TSQUERYOID || + SPI_gettypeid(SPI_tuptable->tupdesc, 2) != TSQUERYOID) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("ts_rewrite query must return two tsquery columns"))); + + while (SPI_processed > 0 && tree) + { + for (i = 0; i < SPI_processed && tree; i++) + { + Datum qdata = SPI_getbinval(SPI_tuptable->vals[i], SPI_tuptable->tupdesc, 1, &isnull); + Datum sdata; + + if (isnull) + continue; + + sdata = SPI_getbinval(SPI_tuptable->vals[i], SPI_tuptable->tupdesc, 2, &isnull); + + if (!isnull) + { + TSQuery qtex = DatumGetTSQuery(qdata); + TSQuery qtsubs = DatumGetTSQuery(sdata); + QTNode *qex, + *qsubs = NULL; + + if (qtex->size == 0) + { + if (qtex != (TSQuery) DatumGetPointer(qdata)) + pfree(qtex); + if (qtsubs != (TSQuery) DatumGetPointer(sdata)) + pfree(qtsubs); + continue; + } + + qex = QT2QTN(GETQUERY(qtex), GETOPERAND(qtex)); + + QTNTernary(qex); + QTNSort(qex); + + if (qtsubs->size) + qsubs = QT2QTN(GETQUERY(qtsubs), GETOPERAND(qtsubs)); + + oldcontext = MemoryContextSwitchTo(outercontext); + tree = findsubquery(tree, qex, qsubs, NULL); + MemoryContextSwitchTo(oldcontext); + + QTNFree(qex); + if (qtex != (TSQuery) DatumGetPointer(qdata)) + pfree(qtex); + QTNFree(qsubs); + if (qtsubs != (TSQuery) DatumGetPointer(sdata)) + pfree(qtsubs); + } + } + + SPI_freetuptable(SPI_tuptable); + SPI_cursor_fetch(portal, true, 100); + } + + SPI_freetuptable(SPI_tuptable); + SPI_cursor_close(portal); + SPI_freeplan(plan); + SPI_finish(); + + if (tree) + { + QTNBinary(tree); + rewrited = QTN2QT(tree); + QTNFree(tree); + PG_FREE_IF_COPY(query, 0); + } + else + { + SET_VARSIZE(rewrited, HDRSIZETQ); + rewrited->size = 0; + } + + pfree(buf); + PG_FREE_IF_COPY(in, 1); + PG_RETURN_POINTER(rewrited); +} + +Datum +tsquery_rewrite_query(PG_FUNCTION_ARGS) +{ + TSQuery query = PG_GETARG_TSQUERY_COPY(0); + TSQuery ex = PG_GETARG_TSQUERY(1); + TSQuery subst = PG_GETARG_TSQUERY(2); + TSQuery rewrited = query; + QTNode *tree, + *qex, + *subs = NULL; + + if (query->size == 0 || ex->size == 0) + { + PG_FREE_IF_COPY(ex, 1); + PG_FREE_IF_COPY(subst, 2); + PG_RETURN_POINTER(rewrited); + } + + tree = QT2QTN(GETQUERY(query), GETOPERAND(query)); + QTNTernary(tree); + QTNSort(tree); + + qex = QT2QTN(GETQUERY(ex), GETOPERAND(ex)); + QTNTernary(qex); + QTNSort(qex); + + if (subst->size) + subs = QT2QTN(GETQUERY(subst), GETOPERAND(subst)); + + tree = findsubquery(tree, qex, subs, NULL); + QTNFree(qex); + QTNFree(subs); + + if (!tree) + { + SET_VARSIZE(rewrited, HDRSIZETQ); + rewrited->size = 0; + PG_FREE_IF_COPY(ex, 1); + PG_FREE_IF_COPY(subst, 2); + PG_RETURN_POINTER(rewrited); + } + else + { + QTNBinary(tree); + rewrited = QTN2QT(tree); + QTNFree(tree); + } + + PG_FREE_IF_COPY(query, 0); + PG_FREE_IF_COPY(ex, 1); + PG_FREE_IF_COPY(subst, 2); + PG_RETURN_POINTER(rewrited); +} diff --git a/src/backend/utils/adt/tsquery_util.c b/src/backend/utils/adt/tsquery_util.c new file mode 100644 index 00000000000..ae8cc318da9 --- /dev/null +++ b/src/backend/utils/adt/tsquery_util.c @@ -0,0 +1,317 @@ +/*------------------------------------------------------------------------- + * + * tsquery_util.c + * Utilities for tsquery datatype + * + * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * $PostgreSQL: pgsql/src/backend/utils/adt/tsquery_util.c,v 1.1 2007/08/21 01:11:19 tgl Exp $ + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "tsearch/ts_type.h" +#include "tsearch/ts_utils.h" + + +QTNode * +QT2QTN(QueryItem * in, char *operand) +{ + QTNode *node = (QTNode *) palloc0(sizeof(QTNode)); + + node->valnode = in; + + if (in->type == OPR) + { + node->child = (QTNode **) palloc0(sizeof(QTNode *) * 2); + node->child[0] = QT2QTN(in + 1, operand); + node->sign = node->child[0]->sign; + if (in->val == (int4) '!') + node->nchild = 1; + else + { + node->nchild = 2; + node->child[1] = QT2QTN(in + in->left, operand); + node->sign |= node->child[1]->sign; + } + } + else if (operand) + { + node->word = operand + in->distance; + node->sign = 1 << (in->val % 32); + } + + return node; +} + +void +QTNFree(QTNode * in) +{ + if (!in) + return; + + if (in->valnode->type == VAL && in->word && (in->flags & QTN_WORDFREE) != 0) + pfree(in->word); + + if (in->child) + { + if (in->valnode) + { + if (in->valnode->type == OPR && in->nchild > 0) + { + int i; + + for (i = 0; i < in->nchild; i++) + QTNFree(in->child[i]); + } + if (in->flags & QTN_NEEDFREE) + pfree(in->valnode); + } + pfree(in->child); + } + + pfree(in); +} + +int +QTNodeCompare(QTNode * an, QTNode * bn) +{ + if (an->valnode->type != bn->valnode->type) + return (an->valnode->type > bn->valnode->type) ? -1 : 1; + else if (an->valnode->val != bn->valnode->val) + return (an->valnode->val > bn->valnode->val) ? -1 : 1; + else if (an->valnode->type == VAL) + { + if (an->valnode->length == bn->valnode->length) + return strncmp(an->word, bn->word, an->valnode->length); + else + return (an->valnode->length > bn->valnode->length) ? -1 : 1; + } + else if (an->nchild != bn->nchild) + { + return (an->nchild > bn->nchild) ? -1 : 1; + } + else + { + int i, + res; + + for (i = 0; i < an->nchild; i++) + if ((res = QTNodeCompare(an->child[i], bn->child[i])) != 0) + return res; + } + + return 0; +} + +static int +cmpQTN(const void *a, const void *b) +{ + return QTNodeCompare(*(QTNode **) a, *(QTNode **) b); +} + +void +QTNSort(QTNode * in) +{ + int i; + + if (in->valnode->type != OPR) + return; + + for (i = 0; i < in->nchild; i++) + QTNSort(in->child[i]); + if (in->nchild > 1) + qsort((void *) in->child, in->nchild, sizeof(QTNode *), cmpQTN); +} + +bool +QTNEq(QTNode * a, QTNode * b) +{ + uint32 sign = a->sign & b->sign; + + if (!(sign == a->sign && sign == b->sign)) + return 0; + + return (QTNodeCompare(a, b) == 0) ? true : false; +} + +void +QTNTernary(QTNode * in) +{ + int i; + + if (in->valnode->type != OPR) + return; + + for (i = 0; i < in->nchild; i++) + QTNTernary(in->child[i]); + + for (i = 0; i < in->nchild; i++) + { + if (in->valnode->type == in->child[i]->valnode->type && in->valnode->val == in->child[i]->valnode->val) + { + QTNode *cc = in->child[i]; + int oldnchild = in->nchild; + + in->nchild += cc->nchild - 1; + in->child = (QTNode **) repalloc(in->child, in->nchild * sizeof(QTNode *)); + + if (i + 1 != oldnchild) + memmove(in->child + i + cc->nchild, in->child + i + 1, + (oldnchild - i - 1) * sizeof(QTNode *)); + + memcpy(in->child + i, cc->child, cc->nchild * sizeof(QTNode *)); + i += cc->nchild - 1; + + pfree(cc); + } + } +} + +void +QTNBinary(QTNode * in) +{ + int i; + + if (in->valnode->type != OPR) + return; + + for (i = 0; i < in->nchild; i++) + QTNBinary(in->child[i]); + + if (in->nchild <= 2) + return; + + while (in->nchild > 2) + { + QTNode *nn = (QTNode *) palloc0(sizeof(QTNode)); + + nn->valnode = (QueryItem *) palloc0(sizeof(QueryItem)); + nn->child = (QTNode **) palloc0(sizeof(QTNode *) * 2); + + nn->nchild = 2; + nn->flags = QTN_NEEDFREE; + + nn->child[0] = in->child[0]; + nn->child[1] = in->child[1]; + nn->sign = nn->child[0]->sign | nn->child[1]->sign; + + nn->valnode->type = in->valnode->type; + nn->valnode->val = in->valnode->val; + + in->child[0] = nn; + in->child[1] = in->child[in->nchild - 1]; + in->nchild--; + } +} + +static void +cntsize(QTNode * in, int4 *sumlen, int4 *nnode) +{ + *nnode += 1; + if (in->valnode->type == OPR) + { + int i; + + for (i = 0; i < in->nchild; i++) + cntsize(in->child[i], sumlen, nnode); + } + else + { + *sumlen += in->valnode->length + 1; + } +} + +typedef struct +{ + QueryItem *curitem; + char *operand; + char *curoperand; +} QTN2QTState; + +static void +fillQT(QTN2QTState * state, QTNode * in) +{ + *(state->curitem) = *(in->valnode); + + if (in->valnode->type == VAL) + { + memcpy(state->curoperand, in->word, in->valnode->length); + state->curitem->distance = state->curoperand - state->operand; + state->curoperand[in->valnode->length] = '\0'; + state->curoperand += in->valnode->length + 1; + state->curitem++; + } + else + { + QueryItem *curitem = state->curitem; + + Assert(in->nchild <= 2); + state->curitem++; + + fillQT(state, in->child[0]); + + if (in->nchild == 2) + { + curitem->left = state->curitem - curitem; + fillQT(state, in->child[1]); + } + } +} + +TSQuery +QTN2QT(QTNode *in) +{ + TSQuery out; + int len; + int sumlen = 0, + nnode = 0; + QTN2QTState state; + + cntsize(in, &sumlen, &nnode); + len = COMPUTESIZE(nnode, sumlen); + + out = (TSQuery) palloc(len); + SET_VARSIZE(out, len); + out->size = nnode; + + state.curitem = GETQUERY(out); + state.operand = state.curoperand = GETOPERAND(out); + + fillQT(&state, in); + return out; +} + +QTNode * +QTNCopy(QTNode *in) +{ + QTNode *out = (QTNode *) palloc(sizeof(QTNode)); + + *out = *in; + out->valnode = (QueryItem *) palloc(sizeof(QueryItem)); + *(out->valnode) = *(in->valnode); + out->flags |= QTN_NEEDFREE; + + if (in->valnode->type == VAL) + { + out->word = palloc(in->valnode->length + 1); + memcpy(out->word, in->word, in->valnode->length); + out->word[in->valnode->length] = '\0'; + out->flags |= QTN_WORDFREE; + } + else + { + int i; + + out->child = (QTNode **) palloc(sizeof(QTNode *) * in->nchild); + + for (i = 0; i < in->nchild; i++) + out->child[i] = QTNCopy(in->child[i]); + } + + return out; +} diff --git a/src/backend/utils/adt/tsrank.c b/src/backend/utils/adt/tsrank.c new file mode 100644 index 00000000000..8b2ab884c8c --- /dev/null +++ b/src/backend/utils/adt/tsrank.c @@ -0,0 +1,804 @@ +/*------------------------------------------------------------------------- + * + * tsrank.c + * rank tsvector by tsquery + * + * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * $PostgreSQL: pgsql/src/backend/utils/adt/tsrank.c,v 1.1 2007/08/21 01:11:19 tgl Exp $ + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <math.h> + +#include "tsearch/ts_type.h" +#include "tsearch/ts_utils.h" +#include "utils/array.h" + + +static float weights[] = {0.1, 0.2, 0.4, 1.0}; + +#define wpos(wep) ( w[ WEP_GETWEIGHT(wep) ] ) + +#define RANK_NO_NORM 0x00 +#define RANK_NORM_LOGLENGTH 0x01 +#define RANK_NORM_LENGTH 0x02 +#define RANK_NORM_EXTDIST 0x04 +#define RANK_NORM_UNIQ 0x08 +#define RANK_NORM_LOGUNIQ 0x10 +#define DEF_NORM_METHOD RANK_NO_NORM + +static float calc_rank_or(float *w, TSVector t, TSQuery q); +static float calc_rank_and(float *w, TSVector t, TSQuery q); + +/* + * Returns a weight of a word collocation + */ +static float4 +word_distance(int4 w) +{ + if (w > 100) + return 1e-30; + + return 1.0 / (1.005 + 0.05 * exp(((float4) w) / 1.5 - 2)); +} + +static int +cnt_length(TSVector t) +{ + WordEntry *ptr = ARRPTR(t), + *end = (WordEntry *) STRPTR(t); + int len = 0, + clen; + + while (ptr < end) + { + if ((clen = POSDATALEN(t, ptr)) == 0) + len += 1; + else + len += clen; + ptr++; + } + + return len; +} + +static int4 +WordECompareQueryItem(char *eval, char *qval, WordEntry * ptr, QueryItem * item) +{ + if (ptr->len == item->length) + return strncmp( + eval + ptr->pos, + qval + item->distance, + item->length); + + return (ptr->len > item->length) ? 1 : -1; +} + +static WordEntry * +find_wordentry(TSVector t, TSQuery q, QueryItem * item) +{ + WordEntry *StopLow = ARRPTR(t); + WordEntry *StopHigh = (WordEntry *) STRPTR(t); + WordEntry *StopMiddle; + int difference; + + /* Loop invariant: StopLow <= item < StopHigh */ + + while (StopLow < StopHigh) + { + StopMiddle = StopLow + (StopHigh - StopLow) / 2; + difference = WordECompareQueryItem(STRPTR(t), GETOPERAND(q), StopMiddle, item); + if (difference == 0) + return StopMiddle; + else if (difference < 0) + StopLow = StopMiddle + 1; + else + StopHigh = StopMiddle; + } + + return NULL; +} + + +static int +compareQueryItem(const void *a, const void *b, void *arg) +{ + char *operand = (char *) arg; + + if ((*(QueryItem **) a)->length == (*(QueryItem **) b)->length) + return strncmp(operand + (*(QueryItem **) a)->distance, + operand + (*(QueryItem **) b)->distance, + (*(QueryItem **) b)->length); + + return ((*(QueryItem **) a)->length > (*(QueryItem **) b)->length) ? 1 : -1; +} + +static QueryItem ** +SortAndUniqItems(char *operand, QueryItem * item, int *size) +{ + QueryItem **res, + **ptr, + **prevptr; + + ptr = res = (QueryItem **) palloc(sizeof(QueryItem *) * *size); + + while ((*size)--) + { + if (item->type == VAL) + { + *ptr = item; + ptr++; + } + item++; + } + + *size = ptr - res; + if (*size < 2) + return res; + + qsort_arg(res, *size, sizeof(QueryItem **), compareQueryItem, (void *) operand); + + ptr = res + 1; + prevptr = res; + + while (ptr - res < *size) + { + if (compareQueryItem((void *) ptr, (void *) prevptr, (void *) operand) != 0) + { + prevptr++; + *prevptr = *ptr; + } + ptr++; + } + + *size = prevptr + 1 - res; + return res; +} + +static WordEntryPos POSNULL[] = { + 0, + 0 +}; + +static float +calc_rank_and(float *w, TSVector t, TSQuery q) +{ + uint16 **pos; + int i, + k, + l, + p; + WordEntry *entry; + WordEntryPos *post, + *ct; + int4 dimt, + lenct, + dist; + float res = -1.0; + QueryItem **item; + int size = q->size; + + item = SortAndUniqItems(GETOPERAND(q), GETQUERY(q), &size); + if (size < 2) + { + pfree(item); + return calc_rank_or(w, t, q); + } + pos = (uint16 **) palloc(sizeof(uint16 *) * q->size); + memset(pos, 0, sizeof(uint16 *) * q->size); + *(uint16 *) POSNULL = lengthof(POSNULL) - 1; + WEP_SETPOS(POSNULL[1], MAXENTRYPOS - 1); + + for (i = 0; i < size; i++) + { + entry = find_wordentry(t, q, item[i]); + if (!entry) + continue; + + if (entry->haspos) + pos[i] = (uint16 *) _POSDATAPTR(t, entry); + else + pos[i] = (uint16 *) POSNULL; + + + dimt = *(uint16 *) (pos[i]); + post = (WordEntryPos *) (pos[i] + 1); + for (k = 0; k < i; k++) + { + if (!pos[k]) + continue; + lenct = *(uint16 *) (pos[k]); + ct = (WordEntryPos *) (pos[k] + 1); + for (l = 0; l < dimt; l++) + { + for (p = 0; p < lenct; p++) + { + dist = Abs((int) WEP_GETPOS(post[l]) - (int) WEP_GETPOS(ct[p])); + if (dist || (dist == 0 && (pos[i] == (uint16 *) POSNULL || pos[k] == (uint16 *) POSNULL))) + { + float curw; + + if (!dist) + dist = MAXENTRYPOS; + curw = sqrt(wpos(post[l]) * wpos(ct[p]) * word_distance(dist)); + res = (res < 0) ? curw : 1.0 - (1.0 - res) * (1.0 - curw); + } + } + } + } + } + pfree(pos); + pfree(item); + return res; +} + +static float +calc_rank_or(float *w, TSVector t, TSQuery q) +{ + WordEntry *entry; + WordEntryPos *post; + int4 dimt, + j, + i; + float res = 0.0; + QueryItem **item; + int size = q->size; + + *(uint16 *) POSNULL = lengthof(POSNULL) - 1; + item = SortAndUniqItems(GETOPERAND(q), GETQUERY(q), &size); + + for (i = 0; i < size; i++) + { + float resj, + wjm; + int4 jm; + + entry = find_wordentry(t, q, item[i]); + if (!entry) + continue; + + if (entry->haspos) + { + dimt = POSDATALEN(t, entry); + post = POSDATAPTR(t, entry); + } + else + { + dimt = *(uint16 *) POSNULL; + post = POSNULL + 1; + } + + resj = 0.0; + wjm = -1.0; + jm = 0; + for (j = 0; j < dimt; j++) + { + resj = resj + wpos(post[j]) / ((j + 1) * (j + 1)); + if (wpos(post[j]) > wjm) + { + wjm = wpos(post[j]); + jm = j; + } + } +/* + limit (sum(i/i^2),i->inf) = pi^2/6 + resj = sum(wi/i^2),i=1,noccurence, + wi - should be sorted desc, + don't sort for now, just choose maximum weight. This should be corrected + Oleg Bartunov +*/ + res = res + (wjm + resj - wjm / ((jm + 1) * (jm + 1))) / 1.64493406685; + } + if (size > 0) + res = res / size; + pfree(item); + return res; +} + +static float +calc_rank(float *w, TSVector t, TSQuery q, int4 method) +{ + QueryItem *item = GETQUERY(q); + float res = 0.0; + int len; + + if (!t->size || !q->size) + return 0.0; + + res = (item->type != VAL && item->val == (int4) '&') ? + calc_rank_and(w, t, q) : calc_rank_or(w, t, q); + + if (res < 0) + res = 1e-20; + + if ((method & RANK_NORM_LOGLENGTH) && t->size > 0) + res /= log((double) (cnt_length(t) + 1)) / log(2.0); + + if (method & RANK_NORM_LENGTH) + { + len = cnt_length(t); + if (len > 0) + res /= (float) len; + } + + if ((method & RANK_NORM_UNIQ) && t->size > 0) + res /= (float) (t->size); + + if ((method & RANK_NORM_LOGUNIQ) && t->size > 0) + res /= log((double) (t->size + 1)) / log(2.0); + + return res; +} + +static float * +getWeights(ArrayType *win) +{ + static float ws[lengthof(weights)]; + int i; + float4 *arrdata; + + if (win == 0) + return weights; + + if (ARR_NDIM(win) != 1) + ereport(ERROR, + (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), + errmsg("array of weight must be one-dimensional"))); + + if (ArrayGetNItems(ARR_NDIM(win), ARR_DIMS(win)) < lengthof(weights)) + ereport(ERROR, + (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), + errmsg("array of weight is too short"))); + + if (ARR_HASNULL(win)) + ereport(ERROR, + (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), + errmsg("array of weight must not contain nulls"))); + + arrdata = (float4 *) ARR_DATA_PTR(win); + for (i = 0; i < lengthof(weights); i++) + { + ws[i] = (arrdata[i] >= 0) ? arrdata[i] : weights[i]; + if (ws[i] > 1.0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("weight out of range"))); + } + + return ws; +} + +Datum +ts_rank_wttf(PG_FUNCTION_ARGS) +{ + ArrayType *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0)); + TSVector txt = PG_GETARG_TSVECTOR(1); + TSQuery query = PG_GETARG_TSQUERY(2); + int method = PG_GETARG_INT32(3); + float res; + + res = calc_rank(getWeights(win), txt, query, method); + + PG_FREE_IF_COPY(win, 0); + PG_FREE_IF_COPY(txt, 1); + PG_FREE_IF_COPY(query, 2); + PG_RETURN_FLOAT4(res); +} + +Datum +ts_rank_wtt(PG_FUNCTION_ARGS) +{ + ArrayType *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0)); + TSVector txt = PG_GETARG_TSVECTOR(1); + TSQuery query = PG_GETARG_TSQUERY(2); + float res; + + res = calc_rank(getWeights(win), txt, query, DEF_NORM_METHOD); + + PG_FREE_IF_COPY(win, 0); + PG_FREE_IF_COPY(txt, 1); + PG_FREE_IF_COPY(query, 2); + PG_RETURN_FLOAT4(res); +} + +Datum +ts_rank_ttf(PG_FUNCTION_ARGS) +{ + TSVector txt = PG_GETARG_TSVECTOR(0); + TSQuery query = PG_GETARG_TSQUERY(1); + int method = PG_GETARG_INT32(2); + float res; + + res = calc_rank(getWeights(NULL), txt, query, method); + + PG_FREE_IF_COPY(txt, 0); + PG_FREE_IF_COPY(query, 1); + PG_RETURN_FLOAT4(res); +} + +Datum +ts_rank_tt(PG_FUNCTION_ARGS) +{ + TSVector txt = PG_GETARG_TSVECTOR(0); + TSQuery query = PG_GETARG_TSQUERY(1); + float res; + + res = calc_rank(getWeights(NULL), txt, query, DEF_NORM_METHOD); + + PG_FREE_IF_COPY(txt, 0); + PG_FREE_IF_COPY(query, 1); + PG_RETURN_FLOAT4(res); +} + +typedef struct +{ + QueryItem **item; + int16 nitem; + bool needfree; + uint8 wclass; + int32 pos; +} DocRepresentation; + +static int +compareDocR(const void *a, const void *b) +{ + if (((DocRepresentation *) a)->pos == ((DocRepresentation *) b)->pos) + return 0; + return (((DocRepresentation *) a)->pos > ((DocRepresentation *) b)->pos) ? 1 : -1; +} + +static bool +checkcondition_QueryItem(void *checkval, QueryItem * val) +{ + return (bool) (val->istrue); +} + +static void +reset_istrue_flag(TSQuery query) +{ + QueryItem *item = GETQUERY(query); + int i; + + /* reset istrue flag */ + for (i = 0; i < query->size; i++) + { + if (item->type == VAL) + item->istrue = 0; + item++; + } +} + +typedef struct +{ + int pos; + int p; + int q; + DocRepresentation *begin; + DocRepresentation *end; +} Extention; + + +static bool +Cover(DocRepresentation * doc, int len, TSQuery query, Extention * ext) +{ + DocRepresentation *ptr; + int lastpos = ext->pos; + int i; + bool found = false; + + reset_istrue_flag(query); + + ext->p = 0x7fffffff; + ext->q = 0; + ptr = doc + ext->pos; + + /* find upper bound of cover from current position, move up */ + while (ptr - doc < len) + { + for (i = 0; i < ptr->nitem; i++) + ptr->item[i]->istrue = 1; + if (TS_execute(GETQUERY(query), NULL, false, checkcondition_QueryItem)) + { + if (ptr->pos > ext->q) + { + ext->q = ptr->pos; + ext->end = ptr; + lastpos = ptr - doc; + found = true; + } + break; + } + ptr++; + } + + if (!found) + return false; + + reset_istrue_flag(query); + + ptr = doc + lastpos; + + /* find lower bound of cover from founded upper bound, move down */ + while (ptr >= doc + ext->pos) + { + for (i = 0; i < ptr->nitem; i++) + ptr->item[i]->istrue = 1; + if (TS_execute(GETQUERY(query), NULL, true, checkcondition_QueryItem)) + { + if (ptr->pos < ext->p) + { + ext->begin = ptr; + ext->p = ptr->pos; + } + break; + } + ptr--; + } + + if (ext->p <= ext->q) + { + /* + * set position for next try to next lexeme after begining of founded + * cover + */ + ext->pos = (ptr - doc) + 1; + return true; + } + + ext->pos++; + return Cover(doc, len, query, ext); +} + +static DocRepresentation * +get_docrep(TSVector txt, TSQuery query, int *doclen) +{ + QueryItem *item = GETQUERY(query); + WordEntry *entry; + WordEntryPos *post; + int4 dimt, + j, + i; + int len = query->size * 4, + cur = 0; + DocRepresentation *doc; + char *operand; + + *(uint16 *) POSNULL = lengthof(POSNULL) - 1; + doc = (DocRepresentation *) palloc(sizeof(DocRepresentation) * len); + operand = GETOPERAND(query); + reset_istrue_flag(query); + + for (i = 0; i < query->size; i++) + { + if (item[i].type != VAL || item[i].istrue) + continue; + + entry = find_wordentry(txt, query, &(item[i])); + if (!entry) + continue; + + if (entry->haspos) + { + dimt = POSDATALEN(txt, entry); + post = POSDATAPTR(txt, entry); + } + else + { + dimt = *(uint16 *) POSNULL; + post = POSNULL + 1; + } + + while (cur + dimt >= len) + { + len *= 2; + doc = (DocRepresentation *) repalloc(doc, sizeof(DocRepresentation) * len); + } + + for (j = 0; j < dimt; j++) + { + if (j == 0) + { + QueryItem *kptr, + *iptr = item + i; + int k; + + doc[cur].needfree = false; + doc[cur].nitem = 0; + doc[cur].item = (QueryItem **) palloc(sizeof(QueryItem *) * query->size); + + for (k = 0; k < query->size; k++) + { + kptr = item + k; + if (k == i || + (item[k].type == VAL && + compareQueryItem(&kptr, &iptr, operand) == 0)) + { + doc[cur].item[doc[cur].nitem] = item + k; + doc[cur].nitem++; + kptr->istrue = 1; + } + } + } + else + { + doc[cur].needfree = false; + doc[cur].nitem = doc[cur - 1].nitem; + doc[cur].item = doc[cur - 1].item; + } + doc[cur].pos = WEP_GETPOS(post[j]); + doc[cur].wclass = WEP_GETWEIGHT(post[j]); + cur++; + } + } + + *doclen = cur; + + if (cur > 0) + { + if (cur > 1) + qsort((void *) doc, cur, sizeof(DocRepresentation), compareDocR); + return doc; + } + + pfree(doc); + return NULL; +} + +static float4 +calc_rank_cd(float4 *arrdata, TSVector txt, TSQuery query, int method) +{ + DocRepresentation *doc; + int len, + i, + doclen = 0; + Extention ext; + double Wdoc = 0.0; + double invws[lengthof(weights)]; + double SumDist = 0.0, + PrevExtPos = 0.0, + CurExtPos = 0.0; + int NExtent = 0; + + for (i = 0; i < lengthof(weights); i++) + { + invws[i] = ((double) ((arrdata[i] >= 0) ? arrdata[i] : weights[i])); + if (invws[i] > 1.0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("weight out of range"))); + invws[i] = 1.0 / invws[i]; + } + + doc = get_docrep(txt, query, &doclen); + if (!doc) + return 0.0; + + MemSet(&ext, 0, sizeof(Extention)); + while (Cover(doc, doclen, query, &ext)) + { + double Cpos = 0.0; + double InvSum = 0.0; + int nNoise; + DocRepresentation *ptr = ext.begin; + + while (ptr <= ext.end) + { + InvSum += invws[ptr->wclass]; + ptr++; + } + + Cpos = ((double) (ext.end - ext.begin + 1)) / InvSum; + + /* + * if doc are big enough then ext.q may be equal to ext.p due to limit + * of posional information. In this case we approximate number of + * noise word as half cover's length + */ + nNoise = (ext.q - ext.p) - (ext.end - ext.begin); + if (nNoise < 0) + nNoise = (ext.end - ext.begin) / 2; + Wdoc += Cpos / ((double) (1 + nNoise)); + + CurExtPos = ((double) (ext.q + ext.p)) / 2.0; + if (NExtent > 0 && CurExtPos > PrevExtPos /* prevent devision by + * zero in a case of + multiple lexize */ ) + SumDist += 1.0 / (CurExtPos - PrevExtPos); + + PrevExtPos = CurExtPos; + NExtent++; + } + + if ((method & RANK_NORM_LOGLENGTH) && txt->size > 0) + Wdoc /= log((double) (cnt_length(txt) + 1)); + + if (method & RANK_NORM_LENGTH) + { + len = cnt_length(txt); + if (len > 0) + Wdoc /= (double) len; + } + + if ((method & RANK_NORM_EXTDIST) && SumDist > 0) + Wdoc /= ((double) NExtent) / SumDist; + + if ((method & RANK_NORM_UNIQ) && txt->size > 0) + Wdoc /= (double) (txt->size); + + if ((method & RANK_NORM_LOGUNIQ) && txt->size > 0) + Wdoc /= log((double) (txt->size + 1)) / log(2.0); + + for (i = 0; i < doclen; i++) + if (doc[i].needfree) + pfree(doc[i].item); + pfree(doc); + + return (float4) Wdoc; +} + +Datum +ts_rankcd_wttf(PG_FUNCTION_ARGS) +{ + ArrayType *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0)); + TSVector txt = PG_GETARG_TSVECTOR(1); + TSQuery query = PG_GETARG_TSQUERY_COPY(2); + int method = PG_GETARG_INT32(3); + float res; + + res = calc_rank_cd(getWeights(win), txt, query, method); + + PG_FREE_IF_COPY(win, 0); + PG_FREE_IF_COPY(txt, 1); + PG_FREE_IF_COPY(query, 2); + PG_RETURN_FLOAT4(res); +} + +Datum +ts_rankcd_wtt(PG_FUNCTION_ARGS) +{ + ArrayType *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0)); + TSVector txt = PG_GETARG_TSVECTOR(1); + TSQuery query = PG_GETARG_TSQUERY_COPY(2); + float res; + + res = calc_rank_cd(getWeights(win), txt, query, DEF_NORM_METHOD); + + PG_FREE_IF_COPY(win, 0); + PG_FREE_IF_COPY(txt, 1); + PG_FREE_IF_COPY(query, 2); + PG_RETURN_FLOAT4(res); +} + +Datum +ts_rankcd_ttf(PG_FUNCTION_ARGS) +{ + TSVector txt = PG_GETARG_TSVECTOR(0); + TSQuery query = PG_GETARG_TSQUERY_COPY(1); + int method = PG_GETARG_INT32(2); + float res; + + res = calc_rank_cd(getWeights(NULL), txt, query, method); + + PG_FREE_IF_COPY(txt, 0); + PG_FREE_IF_COPY(query, 1); + PG_RETURN_FLOAT4(res); +} + +Datum +ts_rankcd_tt(PG_FUNCTION_ARGS) +{ + TSVector txt = PG_GETARG_TSVECTOR(0); + TSQuery query = PG_GETARG_TSQUERY_COPY(1); + float res; + + res = calc_rank_cd(getWeights(NULL), txt, query, DEF_NORM_METHOD); + + PG_FREE_IF_COPY(txt, 0); + PG_FREE_IF_COPY(query, 1); + PG_RETURN_FLOAT4(res); +} diff --git a/src/backend/utils/adt/tsvector.c b/src/backend/utils/adt/tsvector.c new file mode 100644 index 00000000000..04b6345e162 --- /dev/null +++ b/src/backend/utils/adt/tsvector.c @@ -0,0 +1,683 @@ +/*------------------------------------------------------------------------- + * + * tsvector.c + * I/O functions for tsvector + * + * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * $PostgreSQL: pgsql/src/backend/utils/adt/tsvector.c,v 1.1 2007/08/21 01:11:19 tgl Exp $ + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "libpq/pqformat.h" +#include "tsearch/ts_type.h" +#include "tsearch/ts_locale.h" +#include "tsearch/ts_utils.h" +#include "utils/memutils.h" + + +static int +comparePos(const void *a, const void *b) +{ + if (WEP_GETPOS(*(WordEntryPos *) a) == WEP_GETPOS(*(WordEntryPos *) b)) + return 0; + return (WEP_GETPOS(*(WordEntryPos *) a) > WEP_GETPOS(*(WordEntryPos *) b)) ? 1 : -1; +} + +static int +uniquePos(WordEntryPos * a, int4 l) +{ + WordEntryPos *ptr, + *res; + + if (l == 1) + return l; + + res = a; + qsort((void *) a, l, sizeof(WordEntryPos), comparePos); + + ptr = a + 1; + while (ptr - a < l) + { + if (WEP_GETPOS(*ptr) != WEP_GETPOS(*res)) + { + res++; + *res = *ptr; + if (res - a >= MAXNUMPOS - 1 || WEP_GETPOS(*res) == MAXENTRYPOS - 1) + break; + } + else if (WEP_GETWEIGHT(*ptr) > WEP_GETWEIGHT(*res)) + WEP_SETWEIGHT(*res, WEP_GETWEIGHT(*ptr)); + ptr++; + } + + return res + 1 - a; +} + +static int +compareentry(const void *a, const void *b, void *arg) +{ + char *BufferStr = (char *) arg; + + if (((WordEntryIN *) a)->entry.len == ((WordEntryIN *) b)->entry.len) + { + return strncmp(&BufferStr[((WordEntryIN *) a)->entry.pos], + &BufferStr[((WordEntryIN *) b)->entry.pos], + ((WordEntryIN *) a)->entry.len); + } + + return (((WordEntryIN *) a)->entry.len > ((WordEntryIN *) b)->entry.len) ? 1 : -1; +} + +static int +uniqueentry(WordEntryIN * a, int4 l, char *buf, int4 *outbuflen) +{ + WordEntryIN *ptr, + *res; + + res = a; + if (l == 1) + { + if (a->entry.haspos) + { + *(uint16 *) (a->pos) = uniquePos(&(a->pos[1]), *(uint16 *) (a->pos)); + *outbuflen = SHORTALIGN(res->entry.len) + (*(uint16 *) (a->pos) + 1) * sizeof(WordEntryPos); + } + return l; + } + + ptr = a + 1; + qsort_arg((void *) a, l, sizeof(WordEntryIN), compareentry, (void *) buf); + + while (ptr - a < l) + { + if (!(ptr->entry.len == res->entry.len && + strncmp(&buf[ptr->entry.pos], &buf[res->entry.pos], res->entry.len) == 0)) + { + if (res->entry.haspos) + { + *(uint16 *) (res->pos) = uniquePos(&(res->pos[1]), *(uint16 *) (res->pos)); + *outbuflen += *(uint16 *) (res->pos) * sizeof(WordEntryPos); + } + *outbuflen += SHORTALIGN(res->entry.len); + res++; + memcpy(res, ptr, sizeof(WordEntryIN)); + } + else if (ptr->entry.haspos) + { + if (res->entry.haspos) + { + int4 len = *(uint16 *) (ptr->pos) + 1 + *(uint16 *) (res->pos); + + res->pos = (WordEntryPos *) repalloc(res->pos, len * sizeof(WordEntryPos)); + memcpy(&(res->pos[*(uint16 *) (res->pos) + 1]), + &(ptr->pos[1]), *(uint16 *) (ptr->pos) * sizeof(WordEntryPos)); + *(uint16 *) (res->pos) += *(uint16 *) (ptr->pos); + pfree(ptr->pos); + } + else + { + res->entry.haspos = 1; + res->pos = ptr->pos; + } + } + ptr++; + } + if (res->entry.haspos) + { + *(uint16 *) (res->pos) = uniquePos(&(res->pos[1]), *(uint16 *) (res->pos)); + *outbuflen += *(uint16 *) (res->pos) * sizeof(WordEntryPos); + } + *outbuflen += SHORTALIGN(res->entry.len); + + return res + 1 - a; +} + +static int +WordEntryCMP(WordEntry * a, WordEntry * b, char *buf) +{ + return compareentry(a, b, buf); +} + +#define WAITWORD 1 +#define WAITENDWORD 2 +#define WAITNEXTCHAR 3 +#define WAITENDCMPLX 4 +#define WAITPOSINFO 5 +#define INPOSINFO 6 +#define WAITPOSDELIM 7 +#define WAITCHARCMPLX 8 + +#define RESIZEPRSBUF \ +do { \ + if ( state->curpos - state->word + pg_database_encoding_max_length() >= state->len ) \ + { \ + int4 clen = state->curpos - state->word; \ + state->len *= 2; \ + state->word = (char*)repalloc( (void*)state->word, state->len ); \ + state->curpos = state->word + clen; \ + } \ +} while (0) + +bool +gettoken_tsvector(TSVectorParseState *state) +{ + int4 oldstate = 0; + + state->curpos = state->word; + state->state = WAITWORD; + state->alen = 0; + + while (1) + { + if (state->state == WAITWORD) + { + if (*(state->prsbuf) == '\0') + return false; + else if (t_iseq(state->prsbuf, '\'')) + state->state = WAITENDCMPLX; + else if (t_iseq(state->prsbuf, '\\')) + { + state->state = WAITNEXTCHAR; + oldstate = WAITENDWORD; + } + else if (state->oprisdelim && ISOPERATOR(state->prsbuf)) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("syntax error in tsvector"))); + else if (!t_isspace(state->prsbuf)) + { + COPYCHAR(state->curpos, state->prsbuf); + state->curpos += pg_mblen(state->prsbuf); + state->state = WAITENDWORD; + } + } + else if (state->state == WAITNEXTCHAR) + { + if (*(state->prsbuf) == '\0') + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("there is no escaped character"))); + else + { + RESIZEPRSBUF; + COPYCHAR(state->curpos, state->prsbuf); + state->curpos += pg_mblen(state->prsbuf); + state->state = oldstate; + } + } + else if (state->state == WAITENDWORD) + { + if (t_iseq(state->prsbuf, '\\')) + { + state->state = WAITNEXTCHAR; + oldstate = WAITENDWORD; + } + else if (t_isspace(state->prsbuf) || *(state->prsbuf) == '\0' || + (state->oprisdelim && ISOPERATOR(state->prsbuf))) + { + RESIZEPRSBUF; + if (state->curpos == state->word) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("syntax error in tsvector"))); + *(state->curpos) = '\0'; + return true; + } + else if (t_iseq(state->prsbuf, ':')) + { + if (state->curpos == state->word) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("syntax error in tsvector"))); + *(state->curpos) = '\0'; + if (state->oprisdelim) + return true; + else + state->state = INPOSINFO; + } + else + { + RESIZEPRSBUF; + COPYCHAR(state->curpos, state->prsbuf); + state->curpos += pg_mblen(state->prsbuf); + } + } + else if (state->state == WAITENDCMPLX) + { + if (t_iseq(state->prsbuf, '\'')) + { + state->state = WAITCHARCMPLX; + } + else if (t_iseq(state->prsbuf, '\\')) + { + state->state = WAITNEXTCHAR; + oldstate = WAITENDCMPLX; + } + else if (*(state->prsbuf) == '\0') + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("syntax error in tsvector"))); + else + { + RESIZEPRSBUF; + COPYCHAR(state->curpos, state->prsbuf); + state->curpos += pg_mblen(state->prsbuf); + } + } + else if (state->state == WAITCHARCMPLX) + { + if (t_iseq(state->prsbuf, '\'')) + { + RESIZEPRSBUF; + COPYCHAR(state->curpos, state->prsbuf); + state->curpos += pg_mblen(state->prsbuf); + state->state = WAITENDCMPLX; + } + else + { + RESIZEPRSBUF; + *(state->curpos) = '\0'; + if (state->curpos == state->word) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("syntax error in tsvector"))); + if (state->oprisdelim) + { + /* state->prsbuf+=pg_mblen(state->prsbuf); */ + return true; + } + else + state->state = WAITPOSINFO; + continue; /* recheck current character */ + } + } + else if (state->state == WAITPOSINFO) + { + if (t_iseq(state->prsbuf, ':')) + state->state = INPOSINFO; + else + return true; + } + else if (state->state == INPOSINFO) + { + if (t_isdigit(state->prsbuf)) + { + if (state->alen == 0) + { + state->alen = 4; + state->pos = (WordEntryPos *) palloc(sizeof(WordEntryPos) * state->alen); + *(uint16 *) (state->pos) = 0; + } + else if (*(uint16 *) (state->pos) + 1 >= state->alen) + { + state->alen *= 2; + state->pos = (WordEntryPos *) repalloc(state->pos, sizeof(WordEntryPos) * state->alen); + } + (*(uint16 *) (state->pos))++; + WEP_SETPOS(state->pos[*(uint16 *) (state->pos)], LIMITPOS(atoi(state->prsbuf))); + if (WEP_GETPOS(state->pos[*(uint16 *) (state->pos)]) == 0) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("wrong position info in tsvector"))); + WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 0); + state->state = WAITPOSDELIM; + } + else + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("syntax error in tsvector"))); + } + else if (state->state == WAITPOSDELIM) + { + if (t_iseq(state->prsbuf, ',')) + state->state = INPOSINFO; + else if (t_iseq(state->prsbuf, 'a') || t_iseq(state->prsbuf, 'A') || t_iseq(state->prsbuf, '*')) + { + if (WEP_GETWEIGHT(state->pos[*(uint16 *) (state->pos)])) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("syntax error in tsvector"))); + WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 3); + } + else if (t_iseq(state->prsbuf, 'b') || t_iseq(state->prsbuf, 'B')) + { + if (WEP_GETWEIGHT(state->pos[*(uint16 *) (state->pos)])) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("syntax error in tsvector"))); + WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 2); + } + else if (t_iseq(state->prsbuf, 'c') || t_iseq(state->prsbuf, 'C')) + { + if (WEP_GETWEIGHT(state->pos[*(uint16 *) (state->pos)])) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("syntax error in tsvector"))); + WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 1); + } + else if (t_iseq(state->prsbuf, 'd') || t_iseq(state->prsbuf, 'D')) + { + if (WEP_GETWEIGHT(state->pos[*(uint16 *) (state->pos)])) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("syntax error in tsvector"))); + WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 0); + } + else if (t_isspace(state->prsbuf) || + *(state->prsbuf) == '\0') + return true; + else if (!t_isdigit(state->prsbuf)) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("syntax error in tsvector"))); + } + else /* internal error */ + elog(ERROR, "internal error in gettoken_tsvector"); + + /* get next char */ + state->prsbuf += pg_mblen(state->prsbuf); + } + + return false; +} + +Datum +tsvectorin(PG_FUNCTION_ARGS) +{ + char *buf = PG_GETARG_CSTRING(0); + TSVectorParseState state; + WordEntryIN *arr; + WordEntry *inarr; + int4 len = 0, + totallen = 64; + TSVector in; + char *tmpbuf, + *cur; + int4 i, + buflen = 256; + + pg_verifymbstr(buf, strlen(buf), false); + state.prsbuf = buf; + state.len = 32; + state.word = (char *) palloc(state.len); + state.oprisdelim = false; + + arr = (WordEntryIN *) palloc(sizeof(WordEntryIN) * totallen); + cur = tmpbuf = (char *) palloc(buflen); + + while (gettoken_tsvector(&state)) + { + /* + * Realloc buffers if it's needed + */ + if (len >= totallen) + { + totallen *= 2; + arr = (WordEntryIN *) repalloc((void *) arr, sizeof(WordEntryIN) * totallen); + } + + while ((cur - tmpbuf) + (state.curpos - state.word) >= buflen) + { + int4 dist = cur - tmpbuf; + + buflen *= 2; + tmpbuf = (char *) repalloc((void *) tmpbuf, buflen); + cur = tmpbuf + dist; + } + + if (state.curpos - state.word >= MAXSTRLEN) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("word is too long (%d bytes, max %d bytes)", + state.curpos - state.word, MAXSTRLEN))); + + arr[len].entry.len = state.curpos - state.word; + if (cur - tmpbuf > MAXSTRPOS) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("position value too large"))); + arr[len].entry.pos = cur - tmpbuf; + memcpy((void *) cur, (void *) state.word, arr[len].entry.len); + cur += arr[len].entry.len; + + if (state.alen) + { + arr[len].entry.haspos = 1; + arr[len].pos = state.pos; + } + else + arr[len].entry.haspos = 0; + len++; + } + pfree(state.word); + + if (len > 0) + len = uniqueentry(arr, len, tmpbuf, &buflen); + else + buflen = 0; + totallen = CALCDATASIZE(len, buflen); + in = (TSVector) palloc0(totallen); + + SET_VARSIZE(in, totallen); + in->size = len; + cur = STRPTR(in); + inarr = ARRPTR(in); + for (i = 0; i < len; i++) + { + memcpy((void *) cur, (void *) &tmpbuf[arr[i].entry.pos], arr[i].entry.len); + arr[i].entry.pos = cur - STRPTR(in); + cur += SHORTALIGN(arr[i].entry.len); + if (arr[i].entry.haspos) + { + memcpy(cur, arr[i].pos, (*(uint16 *) arr[i].pos + 1) * sizeof(WordEntryPos)); + cur += (*(uint16 *) arr[i].pos + 1) * sizeof(WordEntryPos); + pfree(arr[i].pos); + } + inarr[i] = arr[i].entry; + } + + PG_RETURN_TSVECTOR(in); +} + +Datum +tsvectorout(PG_FUNCTION_ARGS) +{ + TSVector out = PG_GETARG_TSVECTOR(0); + char *outbuf; + int4 i, + lenbuf = 0, + pp; + WordEntry *ptr = ARRPTR(out); + char *curbegin, + *curin, + *curout; + + lenbuf = out->size * 2 /* '' */ + out->size - 1 /* space */ + 2 /* \0 */ ; + for (i = 0; i < out->size; i++) + { + lenbuf += ptr[i].len * 2 * pg_database_encoding_max_length() /* for escape */ ; + if (ptr[i].haspos) + lenbuf += 1 /* : */ + 7 /* int2 + , + weight */ * POSDATALEN(out, &(ptr[i])); + } + + curout = outbuf = (char *) palloc(lenbuf); + for (i = 0; i < out->size; i++) + { + curbegin = curin = STRPTR(out) + ptr->pos; + if (i != 0) + *curout++ = ' '; + *curout++ = '\''; + while (curin - curbegin < ptr->len) + { + int len = pg_mblen(curin); + + if (t_iseq(curin, '\'')) + *curout++ = '\''; + + while (len--) + *curout++ = *curin++; + } + + *curout++ = '\''; + if ((pp = POSDATALEN(out, ptr)) != 0) + { + WordEntryPos *wptr; + + *curout++ = ':'; + wptr = POSDATAPTR(out, ptr); + while (pp) + { + curout += sprintf(curout, "%d", WEP_GETPOS(*wptr)); + switch (WEP_GETWEIGHT(*wptr)) + { + case 3: + *curout++ = 'A'; + break; + case 2: + *curout++ = 'B'; + break; + case 1: + *curout++ = 'C'; + break; + case 0: + default: + break; + } + + if (pp > 1) + *curout++ = ','; + pp--; + wptr++; + } + } + ptr++; + } + + *curout = '\0'; + PG_FREE_IF_COPY(out, 0); + PG_RETURN_CSTRING(outbuf); +} + +Datum +tsvectorsend(PG_FUNCTION_ARGS) +{ + TSVector vec = PG_GETARG_TSVECTOR(0); + StringInfoData buf; + int i, + j; + WordEntry *weptr = ARRPTR(vec); + + pq_begintypsend(&buf); + + pq_sendint(&buf, vec->size, sizeof(int32)); + for (i = 0; i < vec->size; i++) + { + /* + * We are sure that sizeof(WordEntry) == sizeof(int32) + */ + pq_sendint(&buf, *(int32 *) weptr, sizeof(int32)); + + pq_sendbytes(&buf, STRPTR(vec) + weptr->pos, weptr->len); + if (weptr->haspos) + { + WordEntryPos *wepptr = POSDATAPTR(vec, weptr); + + pq_sendint(&buf, POSDATALEN(vec, weptr), sizeof(WordEntryPos)); + for (j = 0; j < POSDATALEN(vec, weptr); j++) + pq_sendint(&buf, wepptr[j], sizeof(WordEntryPos)); + } + weptr++; + } + + PG_RETURN_BYTEA_P(pq_endtypsend(&buf)); +} + +Datum +tsvectorrecv(PG_FUNCTION_ARGS) +{ + StringInfo buf = (StringInfo) PG_GETARG_POINTER(0); + TSVector vec; + int i, + size, + len = DATAHDRSIZE; + WordEntry *weptr; + int datalen = 0; + + size = pq_getmsgint(buf, sizeof(uint32)); + if (size < 0 || size > (MaxAllocSize / sizeof(WordEntry))) + elog(ERROR, "invalid size of tsvector"); + + len += sizeof(WordEntry) * size; + + len *= 2; + vec = (TSVector) palloc0(len); + vec->size = size; + + weptr = ARRPTR(vec); + for (i = 0; i < size; i++) + { + int tmp; + + weptr = ARRPTR(vec) + i; + + /* + * We are sure that sizeof(WordEntry) == sizeof(int32) + */ + tmp = pq_getmsgint(buf, sizeof(int32)); + *weptr = *(WordEntry *) & tmp; + + while (CALCDATASIZE(size, datalen + SHORTALIGN(weptr->len)) >= len) + { + len *= 2; + vec = (TSVector) repalloc(vec, len); + weptr = ARRPTR(vec) + i; + } + + memcpy(STRPTR(vec) + weptr->pos, + pq_getmsgbytes(buf, weptr->len), + weptr->len); + datalen += SHORTALIGN(weptr->len); + + if (i > 0 && WordEntryCMP(weptr, weptr - 1, STRPTR(vec)) <= 0) + elog(ERROR, "lexemes are unordered"); + + if (weptr->haspos) + { + uint16 j, + npos; + WordEntryPos *wepptr; + + npos = (uint16) pq_getmsgint(buf, sizeof(int16)); + if (npos > MAXNUMPOS) + elog(ERROR, "unexpected number of positions"); + + while (CALCDATASIZE(size, datalen + (npos + 1) * sizeof(WordEntryPos)) >= len) + { + len *= 2; + vec = (TSVector) repalloc(vec, len); + weptr = ARRPTR(vec) + i; + } + + memcpy(_POSDATAPTR(vec, weptr), &npos, sizeof(int16)); + wepptr = POSDATAPTR(vec, weptr); + for (j = 0; j < npos; j++) + { + wepptr[j] = (WordEntryPos) pq_getmsgint(buf, sizeof(int16)); + if (j > 0 && WEP_GETPOS(wepptr[j]) <= WEP_GETPOS(wepptr[j - 1])) + elog(ERROR, "position information is unordered"); + } + + datalen += (npos + 1) * sizeof(WordEntry); + } + } + + SET_VARSIZE(vec, CALCDATASIZE(vec->size, datalen)); + + PG_RETURN_TSVECTOR(vec); +} diff --git a/src/backend/utils/adt/tsvector_op.c b/src/backend/utils/adt/tsvector_op.c new file mode 100644 index 00000000000..341247bec75 --- /dev/null +++ b/src/backend/utils/adt/tsvector_op.c @@ -0,0 +1,1334 @@ +/*------------------------------------------------------------------------- + * + * tsvector_op.c + * operations over tsvector + * + * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * $PostgreSQL: pgsql/src/backend/utils/adt/tsvector_op.c,v 1.1 2007/08/21 01:11:19 tgl Exp $ + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "catalog/namespace.h" +#include "commands/trigger.h" +#include "executor/spi.h" +#include "funcapi.h" +#include "mb/pg_wchar.h" +#include "tsearch/ts_type.h" +#include "tsearch/ts_utils.h" +#include "utils/builtins.h" +#include "utils/lsyscache.h" + + +typedef struct +{ + WordEntry *arrb; + WordEntry *arre; + char *values; + char *operand; +} CHKVAL; + +typedef struct +{ + uint32 cur; + TSVector stat; +} StatStorage; + +typedef struct +{ + uint32 len; + uint32 pos; + uint32 ndoc; + uint32 nentry; +} StatEntry; + +typedef struct +{ + int32 vl_len_; /* varlena header (do not touch directly!) */ + int4 size; + int4 weight; + char data[1]; +} tsstat; + +#define STATHDRSIZE (sizeof(int4) * 4) +#define CALCSTATSIZE(x, lenstr) ( (x) * sizeof(StatEntry) + STATHDRSIZE + (lenstr) ) +#define STATPTR(x) ( (StatEntry*) ( (char*)(x) + STATHDRSIZE ) ) +#define STATSTRPTR(x) ( (char*)(x) + STATHDRSIZE + ( sizeof(StatEntry) * ((TSVector)(x))->size ) ) +#define STATSTRSIZE(x) ( VARSIZE((TSVector)(x)) - STATHDRSIZE - ( sizeof(StatEntry) * ((TSVector)(x))->size ) ) + + +static Datum tsvector_update_trigger(PG_FUNCTION_ARGS, bool config_column); + + +static int +silly_cmp_tsvector(const TSVector a, const TSVector b) +{ + if (VARSIZE(a) < VARSIZE(b)) + return -1; + else if (VARSIZE(a) > VARSIZE(b)) + return 1; + else if (a->size < b->size) + return -1; + else if (a->size > b->size) + return 1; + else + { + WordEntry *aptr = ARRPTR(a); + WordEntry *bptr = ARRPTR(b); + int i = 0; + int res; + + + for (i = 0; i < a->size; i++) + { + if (aptr->haspos != bptr->haspos) + { + return (aptr->haspos > bptr->haspos) ? -1 : 1; + } + else if (aptr->len != bptr->len) + { + return (aptr->len > bptr->len) ? -1 : 1; + } + else if ((res = strncmp(STRPTR(a) + aptr->pos, STRPTR(b) + bptr->pos, bptr->len)) != 0) + { + return res; + } + else if (aptr->haspos) + { + WordEntryPos *ap = POSDATAPTR(a, aptr); + WordEntryPos *bp = POSDATAPTR(b, bptr); + int j; + + if (POSDATALEN(a, aptr) != POSDATALEN(b, bptr)) + return (POSDATALEN(a, aptr) > POSDATALEN(b, bptr)) ? -1 : 1; + + for (j = 0; j < POSDATALEN(a, aptr); j++) + { + if (WEP_GETPOS(*ap) != WEP_GETPOS(*bp)) + { + return (WEP_GETPOS(*ap) > WEP_GETPOS(*bp)) ? -1 : 1; + } + else if (WEP_GETWEIGHT(*ap) != WEP_GETWEIGHT(*bp)) + { + return (WEP_GETWEIGHT(*ap) > WEP_GETWEIGHT(*bp)) ? -1 : 1; + } + ap++, bp++; + } + } + + aptr++; + bptr++; + } + } + + return 0; +} + +#define TSVECTORCMPFUNC( type, action, ret ) \ +Datum \ +tsvector_##type(PG_FUNCTION_ARGS) \ +{ \ + TSVector a = PG_GETARG_TSVECTOR(0); \ + TSVector b = PG_GETARG_TSVECTOR(1); \ + int res = silly_cmp_tsvector(a, b); \ + PG_FREE_IF_COPY(a,0); \ + PG_FREE_IF_COPY(b,1); \ + PG_RETURN_##ret( res action 0 ); \ +} + +TSVECTORCMPFUNC(lt, <, BOOL); +TSVECTORCMPFUNC(le, <=, BOOL); +TSVECTORCMPFUNC(eq, ==, BOOL); +TSVECTORCMPFUNC(ge, >=, BOOL); +TSVECTORCMPFUNC(gt, >, BOOL); +TSVECTORCMPFUNC(ne, !=, BOOL); +TSVECTORCMPFUNC(cmp, +, INT32); + +Datum +tsvector_strip(PG_FUNCTION_ARGS) +{ + TSVector in = PG_GETARG_TSVECTOR(0); + TSVector out; + int i, + len = 0; + WordEntry *arrin = ARRPTR(in), + *arrout; + char *cur; + + for (i = 0; i < in->size; i++) + len += SHORTALIGN(arrin[i].len); + + len = CALCDATASIZE(in->size, len); + out = (TSVector) palloc0(len); + SET_VARSIZE(out, len); + out->size = in->size; + arrout = ARRPTR(out); + cur = STRPTR(out); + for (i = 0; i < in->size; i++) + { + memcpy(cur, STRPTR(in) + arrin[i].pos, arrin[i].len); + arrout[i].haspos = 0; + arrout[i].len = arrin[i].len; + arrout[i].pos = cur - STRPTR(out); + cur += SHORTALIGN(arrout[i].len); + } + + PG_FREE_IF_COPY(in, 0); + PG_RETURN_POINTER(out); +} + +Datum +tsvector_length(PG_FUNCTION_ARGS) +{ + TSVector in = PG_GETARG_TSVECTOR(0); + int4 ret = in->size; + + PG_FREE_IF_COPY(in, 0); + PG_RETURN_INT32(ret); +} + +Datum +tsvector_setweight(PG_FUNCTION_ARGS) +{ + TSVector in = PG_GETARG_TSVECTOR(0); + char cw = PG_GETARG_CHAR(1); + TSVector out; + int i, + j; + WordEntry *entry; + WordEntryPos *p; + int w = 0; + + switch (cw) + { + case 'A': + case 'a': + w = 3; + break; + case 'B': + case 'b': + w = 2; + break; + case 'C': + case 'c': + w = 1; + break; + case 'D': + case 'd': + w = 0; + break; + /* internal error */ + default: + elog(ERROR, "unrecognized weight"); + } + + out = (TSVector) palloc(VARSIZE(in)); + memcpy(out, in, VARSIZE(in)); + entry = ARRPTR(out); + i = out->size; + while (i--) + { + if ((j = POSDATALEN(out, entry)) != 0) + { + p = POSDATAPTR(out, entry); + while (j--) + { + WEP_SETWEIGHT(*p, w); + p++; + } + } + entry++; + } + + PG_FREE_IF_COPY(in, 0); + PG_RETURN_POINTER(out); +} + +static int +compareEntry(char *ptra, WordEntry * a, char *ptrb, WordEntry * b) +{ + if (a->len == b->len) + { + return strncmp( + ptra + a->pos, + ptrb + b->pos, + a->len); + } + return (a->len > b->len) ? 1 : -1; +} + +static int4 +add_pos(TSVector src, WordEntry * srcptr, TSVector dest, WordEntry * destptr, int4 maxpos) +{ + uint16 *clen = (uint16 *) _POSDATAPTR(dest, destptr); + int i; + uint16 slen = POSDATALEN(src, srcptr), + startlen; + WordEntryPos *spos = POSDATAPTR(src, srcptr), + *dpos = POSDATAPTR(dest, destptr); + + if (!destptr->haspos) + *clen = 0; + + startlen = *clen; + for (i = 0; i < slen && *clen < MAXNUMPOS && (*clen == 0 || WEP_GETPOS(dpos[*clen - 1]) != MAXENTRYPOS - 1); i++) + { + WEP_SETWEIGHT(dpos[*clen], WEP_GETWEIGHT(spos[i])); + WEP_SETPOS(dpos[*clen], LIMITPOS(WEP_GETPOS(spos[i]) + maxpos)); + (*clen)++; + } + + if (*clen != startlen) + destptr->haspos = 1; + return *clen - startlen; +} + + +Datum +tsvector_concat(PG_FUNCTION_ARGS) +{ + TSVector in1 = PG_GETARG_TSVECTOR(0); + TSVector in2 = PG_GETARG_TSVECTOR(1); + TSVector out; + WordEntry *ptr; + WordEntry *ptr1, + *ptr2; + WordEntryPos *p; + int maxpos = 0, + i, + j, + i1, + i2; + char *cur; + char *data, + *data1, + *data2; + + ptr = ARRPTR(in1); + i = in1->size; + while (i--) + { + if ((j = POSDATALEN(in1, ptr)) != 0) + { + p = POSDATAPTR(in1, ptr); + while (j--) + { + if (WEP_GETPOS(*p) > maxpos) + maxpos = WEP_GETPOS(*p); + p++; + } + } + ptr++; + } + + ptr1 = ARRPTR(in1); + ptr2 = ARRPTR(in2); + data1 = STRPTR(in1); + data2 = STRPTR(in2); + i1 = in1->size; + i2 = in2->size; + out = (TSVector) palloc0(VARSIZE(in1) + VARSIZE(in2)); + SET_VARSIZE(out, VARSIZE(in1) + VARSIZE(in2)); + out->size = in1->size + in2->size; + data = cur = STRPTR(out); + ptr = ARRPTR(out); + while (i1 && i2) + { + int cmp = compareEntry(data1, ptr1, data2, ptr2); + + if (cmp < 0) + { /* in1 first */ + ptr->haspos = ptr1->haspos; + ptr->len = ptr1->len; + memcpy(cur, data1 + ptr1->pos, ptr1->len); + ptr->pos = cur - data; + cur += SHORTALIGN(ptr1->len); + if (ptr->haspos) + { + memcpy(cur, _POSDATAPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16)); + cur += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16); + } + ptr++; + ptr1++; + i1--; + } + else if (cmp > 0) + { /* in2 first */ + ptr->haspos = ptr2->haspos; + ptr->len = ptr2->len; + memcpy(cur, data2 + ptr2->pos, ptr2->len); + ptr->pos = cur - data; + cur += SHORTALIGN(ptr2->len); + if (ptr->haspos) + { + int addlen = add_pos(in2, ptr2, out, ptr, maxpos); + + if (addlen == 0) + ptr->haspos = 0; + else + cur += addlen * sizeof(WordEntryPos) + sizeof(uint16); + } + ptr++; + ptr2++; + i2--; + } + else + { + ptr->haspos = ptr1->haspos | ptr2->haspos; + ptr->len = ptr1->len; + memcpy(cur, data1 + ptr1->pos, ptr1->len); + ptr->pos = cur - data; + cur += SHORTALIGN(ptr1->len); + if (ptr->haspos) + { + if (ptr1->haspos) + { + memcpy(cur, _POSDATAPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16)); + cur += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16); + if (ptr2->haspos) + cur += add_pos(in2, ptr2, out, ptr, maxpos) * sizeof(WordEntryPos); + } + else if (ptr2->haspos) + { + int addlen = add_pos(in2, ptr2, out, ptr, maxpos); + + if (addlen == 0) + ptr->haspos = 0; + else + cur += addlen * sizeof(WordEntryPos) + sizeof(uint16); + } + } + ptr++; + ptr1++; + ptr2++; + i1--; + i2--; + } + } + + while (i1) + { + ptr->haspos = ptr1->haspos; + ptr->len = ptr1->len; + memcpy(cur, data1 + ptr1->pos, ptr1->len); + ptr->pos = cur - data; + cur += SHORTALIGN(ptr1->len); + if (ptr->haspos) + { + memcpy(cur, _POSDATAPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16)); + cur += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16); + } + ptr++; + ptr1++; + i1--; + } + + while (i2) + { + ptr->haspos = ptr2->haspos; + ptr->len = ptr2->len; + memcpy(cur, data2 + ptr2->pos, ptr2->len); + ptr->pos = cur - data; + cur += SHORTALIGN(ptr2->len); + if (ptr->haspos) + { + int addlen = add_pos(in2, ptr2, out, ptr, maxpos); + + if (addlen == 0) + ptr->haspos = 0; + else + cur += addlen * sizeof(WordEntryPos) + sizeof(uint16); + } + ptr++; + ptr2++; + i2--; + } + + out->size = ptr - ARRPTR(out); + SET_VARSIZE(out, CALCDATASIZE(out->size, cur - data)); + if (data != STRPTR(out)) + memmove(STRPTR(out), data, cur - data); + + PG_FREE_IF_COPY(in1, 0); + PG_FREE_IF_COPY(in2, 1); + PG_RETURN_POINTER(out); +} + +/* + * compare 2 string values + */ +static int4 +ValCompare(CHKVAL * chkval, WordEntry * ptr, QueryItem * item) +{ + if (ptr->len == item->length) + return strncmp( + &(chkval->values[ptr->pos]), + &(chkval->operand[item->distance]), + item->length); + + return (ptr->len > item->length) ? 1 : -1; +} + +/* + * check weight info + */ +static bool +checkclass_str(CHKVAL * chkval, WordEntry * val, QueryItem * item) +{ + WordEntryPos *ptr = (WordEntryPos *) (chkval->values + val->pos + SHORTALIGN(val->len) + sizeof(uint16)); + uint16 len = *((uint16 *) (chkval->values + val->pos + SHORTALIGN(val->len))); + + while (len--) + { + if (item->weight & (1 << WEP_GETWEIGHT(*ptr))) + return true; + ptr++; + } + return false; +} + +/* + * is there value 'val' in array or not ? + */ +static bool +checkcondition_str(void *checkval, QueryItem * val) +{ + WordEntry *StopLow = ((CHKVAL *) checkval)->arrb; + WordEntry *StopHigh = ((CHKVAL *) checkval)->arre; + WordEntry *StopMiddle; + int difference; + + /* Loop invariant: StopLow <= val < StopHigh */ + + while (StopLow < StopHigh) + { + StopMiddle = StopLow + (StopHigh - StopLow) / 2; + difference = ValCompare((CHKVAL *) checkval, StopMiddle, val); + if (difference == 0) + return (val->weight && StopMiddle->haspos) ? + checkclass_str((CHKVAL *) checkval, StopMiddle, val) : true; + else if (difference < 0) + StopLow = StopMiddle + 1; + else + StopHigh = StopMiddle; + } + + return (false); +} + +/* + * check for boolean condition + */ +bool +TS_execute(QueryItem * curitem, void *checkval, bool calcnot, bool (*chkcond) (void *checkval, QueryItem * val)) +{ + if (curitem->type == VAL) + return chkcond(checkval, curitem); + else if (curitem->val == (int4) '!') + { + return (calcnot) ? + ((TS_execute(curitem + 1, checkval, calcnot, chkcond)) ? false : true) + : true; + } + else if (curitem->val == (int4) '&') + { + if (TS_execute(curitem + curitem->left, checkval, calcnot, chkcond)) + return TS_execute(curitem + 1, checkval, calcnot, chkcond); + else + return false; + } + else + { /* |-operator */ + if (TS_execute(curitem + curitem->left, checkval, calcnot, chkcond)) + return true; + else + return TS_execute(curitem + 1, checkval, calcnot, chkcond); + } + return false; +} + +/* + * boolean operations + */ +Datum +ts_match_qv(PG_FUNCTION_ARGS) +{ + PG_RETURN_DATUM(DirectFunctionCall2(ts_match_vq, + PG_GETARG_DATUM(1), + PG_GETARG_DATUM(0))); +} + +Datum +ts_match_vq(PG_FUNCTION_ARGS) +{ + TSVector val = PG_GETARG_TSVECTOR(0); + TSQuery query = PG_GETARG_TSQUERY(1); + CHKVAL chkval; + bool result; + + if (!val->size || !query->size) + { + PG_FREE_IF_COPY(val, 0); + PG_FREE_IF_COPY(query, 1); + PG_RETURN_BOOL(false); + } + + chkval.arrb = ARRPTR(val); + chkval.arre = chkval.arrb + val->size; + chkval.values = STRPTR(val); + chkval.operand = GETOPERAND(query); + result = TS_execute( + GETQUERY(query), + &chkval, + true, + checkcondition_str + ); + + PG_FREE_IF_COPY(val, 0); + PG_FREE_IF_COPY(query, 1); + PG_RETURN_BOOL(result); +} + +Datum +ts_match_tt(PG_FUNCTION_ARGS) +{ + TSVector vector; + TSQuery query; + bool res; + + vector = DatumGetTSVector(DirectFunctionCall1(to_tsvector, + PG_GETARG_DATUM(0))); + query = DatumGetTSQuery(DirectFunctionCall1(plainto_tsquery, + PG_GETARG_DATUM(1))); + + res = DatumGetBool(DirectFunctionCall2(ts_match_vq, + TSVectorGetDatum(vector), + TSQueryGetDatum(query))); + + pfree(vector); + pfree(query); + + PG_RETURN_BOOL(res); +} + +Datum +ts_match_tq(PG_FUNCTION_ARGS) +{ + TSVector vector; + TSQuery query = PG_GETARG_TSQUERY(1); + bool res; + + vector = DatumGetTSVector(DirectFunctionCall1(to_tsvector, + PG_GETARG_DATUM(0))); + + res = DatumGetBool(DirectFunctionCall2(ts_match_vq, + TSVectorGetDatum(vector), + TSQueryGetDatum(query))); + + pfree(vector); + PG_FREE_IF_COPY(query, 1); + + PG_RETURN_BOOL(res); +} + +/* + * Statistics of tsvector + */ +static int +check_weight(TSVector txt, WordEntry * wptr, int8 weight) +{ + int len = POSDATALEN(txt, wptr); + int num = 0; + WordEntryPos *ptr = POSDATAPTR(txt, wptr); + + while (len--) + { + if (weight & (1 << WEP_GETWEIGHT(*ptr))) + num++; + ptr++; + } + return num; +} + +static WordEntry ** +SEI_realloc(WordEntry ** in, uint32 *len) +{ + if (*len == 0 || in == NULL) + { + *len = 8; + in = palloc(sizeof(WordEntry *) * (*len)); + } + else + { + *len *= 2; + in = repalloc(in, sizeof(WordEntry *) * (*len)); + } + return in; +} + +static int +compareStatWord(StatEntry * a, WordEntry * b, tsstat * stat, TSVector txt) +{ + if (a->len == b->len) + return strncmp( + STATSTRPTR(stat) + a->pos, + STRPTR(txt) + b->pos, + a->len + ); + return (a->len > b->len) ? 1 : -1; +} + +static tsstat * +formstat(tsstat * stat, TSVector txt, WordEntry ** entry, uint32 len) +{ + tsstat *newstat; + uint32 totallen, + nentry; + uint32 slen = 0; + WordEntry **ptr = entry; + char *curptr; + StatEntry *sptr, + *nptr; + + while (ptr - entry < len) + { + slen += (*ptr)->len; + ptr++; + } + + nentry = stat->size + len; + slen += STATSTRSIZE(stat); + totallen = CALCSTATSIZE(nentry, slen); + newstat = palloc(totallen); + SET_VARSIZE(newstat, totallen); + newstat->weight = stat->weight; + newstat->size = nentry; + + memcpy(STATSTRPTR(newstat), STATSTRPTR(stat), STATSTRSIZE(stat)); + curptr = STATSTRPTR(newstat) + STATSTRSIZE(stat); + + ptr = entry; + sptr = STATPTR(stat); + nptr = STATPTR(newstat); + + if (len == 1) + { + StatEntry *StopLow = STATPTR(stat); + StatEntry *StopHigh = (StatEntry *) STATSTRPTR(stat); + + while (StopLow < StopHigh) + { + sptr = StopLow + (StopHigh - StopLow) / 2; + if (compareStatWord(sptr, *ptr, stat, txt) < 0) + StopLow = sptr + 1; + else + StopHigh = sptr; + } + nptr = STATPTR(newstat) + (StopLow - STATPTR(stat)); + memcpy(STATPTR(newstat), STATPTR(stat), sizeof(StatEntry) * (StopLow - STATPTR(stat))); + if ((*ptr)->haspos) + nptr->nentry = (stat->weight) ? check_weight(txt, *ptr, stat->weight) : POSDATALEN(txt, *ptr); + else + nptr->nentry = 1; + nptr->ndoc = 1; + nptr->len = (*ptr)->len; + memcpy(curptr, STRPTR(txt) + (*ptr)->pos, nptr->len); + nptr->pos = curptr - STATSTRPTR(newstat); + memcpy(nptr + 1, StopLow, sizeof(StatEntry) * (((StatEntry *) STATSTRPTR(stat)) - StopLow)); + } + else + { + while (sptr - STATPTR(stat) < stat->size && ptr - entry < len) + { + if (compareStatWord(sptr, *ptr, stat, txt) < 0) + { + memcpy(nptr, sptr, sizeof(StatEntry)); + sptr++; + } + else + { + if ((*ptr)->haspos) + nptr->nentry = (stat->weight) ? check_weight(txt, *ptr, stat->weight) : POSDATALEN(txt, *ptr); + else + nptr->nentry = 1; + nptr->ndoc = 1; + nptr->len = (*ptr)->len; + memcpy(curptr, STRPTR(txt) + (*ptr)->pos, nptr->len); + nptr->pos = curptr - STATSTRPTR(newstat); + curptr += nptr->len; + ptr++; + } + nptr++; + } + + memcpy(nptr, sptr, sizeof(StatEntry) * (stat->size - (sptr - STATPTR(stat)))); + + while (ptr - entry < len) + { + if ((*ptr)->haspos) + nptr->nentry = (stat->weight) ? check_weight(txt, *ptr, stat->weight) : POSDATALEN(txt, *ptr); + else + nptr->nentry = 1; + nptr->ndoc = 1; + nptr->len = (*ptr)->len; + memcpy(curptr, STRPTR(txt) + (*ptr)->pos, nptr->len); + nptr->pos = curptr - STATSTRPTR(newstat); + curptr += nptr->len; + ptr++; + nptr++; + } + } + + return newstat; +} + +static tsstat * +ts_accum(tsstat * stat, Datum data) +{ + tsstat *newstat; + TSVector txt = DatumGetTSVector(data); + WordEntry **newentry = NULL; + uint32 len = 0, + cur = 0; + StatEntry *sptr; + WordEntry *wptr; + int n = 0; + + if (stat == NULL) + { /* Init in first */ + stat = palloc(STATHDRSIZE); + SET_VARSIZE(stat, STATHDRSIZE); + stat->size = 0; + stat->weight = 0; + } + + /* simple check of correctness */ + if (txt == NULL || txt->size == 0) + { + if (txt != (TSVector) DatumGetPointer(data)) + pfree(txt); + return stat; + } + + sptr = STATPTR(stat); + wptr = ARRPTR(txt); + + if (stat->size < 100 * txt->size) + { /* merge */ + while (sptr - STATPTR(stat) < stat->size && wptr - ARRPTR(txt) < txt->size) + { + int cmp = compareStatWord(sptr, wptr, stat, txt); + + if (cmp < 0) + sptr++; + else if (cmp == 0) + { + if (stat->weight == 0) + { + sptr->ndoc++; + sptr->nentry += (wptr->haspos) ? POSDATALEN(txt, wptr) : 1; + } + else if (wptr->haspos && (n = check_weight(txt, wptr, stat->weight)) != 0) + { + sptr->ndoc++; + sptr->nentry += n; + } + sptr++; + wptr++; + } + else + { + if (stat->weight == 0 || check_weight(txt, wptr, stat->weight) != 0) + { + if (cur == len) + newentry = SEI_realloc(newentry, &len); + newentry[cur] = wptr; + cur++; + } + wptr++; + } + } + + while (wptr - ARRPTR(txt) < txt->size) + { + if (stat->weight == 0 || check_weight(txt, wptr, stat->weight) != 0) + { + if (cur == len) + newentry = SEI_realloc(newentry, &len); + newentry[cur] = wptr; + cur++; + } + wptr++; + } + } + else + { /* search */ + while (wptr - ARRPTR(txt) < txt->size) + { + StatEntry *StopLow = STATPTR(stat); + StatEntry *StopHigh = (StatEntry *) STATSTRPTR(stat); + int cmp; + + while (StopLow < StopHigh) + { + sptr = StopLow + (StopHigh - StopLow) / 2; + cmp = compareStatWord(sptr, wptr, stat, txt); + if (cmp == 0) + { + if (stat->weight == 0) + { + sptr->ndoc++; + sptr->nentry += (wptr->haspos) ? POSDATALEN(txt, wptr) : 1; + } + else if (wptr->haspos && (n = check_weight(txt, wptr, stat->weight)) != 0) + { + sptr->ndoc++; + sptr->nentry += n; + } + break; + } + else if (cmp < 0) + StopLow = sptr + 1; + else + StopHigh = sptr; + } + + if (StopLow >= StopHigh) + { /* not found */ + if (stat->weight == 0 || check_weight(txt, wptr, stat->weight) != 0) + { + if (cur == len) + newentry = SEI_realloc(newentry, &len); + newentry[cur] = wptr; + cur++; + } + } + wptr++; + } + } + + + if (cur == 0) + { /* no new words */ + if (txt != (TSVector) DatumGetPointer(data)) + pfree(txt); + return stat; + } + + newstat = formstat(stat, txt, newentry, cur); + pfree(newentry); + + if (txt != (TSVector) DatumGetPointer(data)) + pfree(txt); + return newstat; +} + +static void +ts_setup_firstcall(FunctionCallInfo fcinfo, FuncCallContext *funcctx, + tsstat * stat) +{ + TupleDesc tupdesc; + MemoryContext oldcontext; + StatStorage *st; + + oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + st = palloc(sizeof(StatStorage)); + st->cur = 0; + st->stat = palloc(VARSIZE(stat)); + memcpy(st->stat, stat, VARSIZE(stat)); + funcctx->user_fctx = (void *) st; + + tupdesc = CreateTemplateTupleDesc(3, false); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "word", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "ndoc", + INT4OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 3, "nentry", + INT4OID, -1, 0); + funcctx->tuple_desc = BlessTupleDesc(tupdesc); + funcctx->attinmeta = TupleDescGetAttInMetadata(tupdesc); + + MemoryContextSwitchTo(oldcontext); +} + + +static Datum +ts_process_call(FuncCallContext *funcctx) +{ + StatStorage *st; + + st = (StatStorage *) funcctx->user_fctx; + + if (st->cur < st->stat->size) + { + Datum result; + char *values[3]; + char ndoc[16]; + char nentry[16]; + StatEntry *entry = STATPTR(st->stat) + st->cur; + HeapTuple tuple; + + values[0] = palloc(entry->len + 1); + memcpy(values[0], STATSTRPTR(st->stat) + entry->pos, entry->len); + (values[0])[entry->len] = '\0'; + sprintf(ndoc, "%d", entry->ndoc); + values[1] = ndoc; + sprintf(nentry, "%d", entry->nentry); + values[2] = nentry; + + tuple = BuildTupleFromCStrings(funcctx->attinmeta, values); + result = HeapTupleGetDatum(tuple); + + pfree(values[0]); + st->cur++; + return result; + } + else + { + pfree(st->stat); + pfree(st); + } + + return (Datum) 0; +} + +static tsstat * +ts_stat_sql(text *txt, text *ws) +{ + char *query = TextPGetCString(txt); + int i; + tsstat *newstat, + *stat; + bool isnull; + Portal portal; + void *plan; + + if ((plan = SPI_prepare(query, 0, NULL)) == NULL) + /* internal error */ + elog(ERROR, "SPI_prepare(\"%s\") failed", query); + + if ((portal = SPI_cursor_open(NULL, plan, NULL, NULL, false)) == NULL) + /* internal error */ + elog(ERROR, "SPI_cursor_open(\"%s\") failed", query); + + SPI_cursor_fetch(portal, true, 100); + + if (SPI_tuptable->tupdesc->natts != 1 || + SPI_gettypeid(SPI_tuptable->tupdesc, 1) != TSVECTOROID) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("ts_stat query must return one tsvector column"))); + + stat = palloc(STATHDRSIZE); + SET_VARSIZE(stat, STATHDRSIZE); + stat->size = 0; + stat->weight = 0; + + if (ws) + { + char *buf; + + buf = VARDATA(ws); + while (buf - VARDATA(ws) < VARSIZE(ws) - VARHDRSZ) + { + if (pg_mblen(buf) == 1) + { + switch (*buf) + { + case 'A': + case 'a': + stat->weight |= 1 << 3; + break; + case 'B': + case 'b': + stat->weight |= 1 << 2; + break; + case 'C': + case 'c': + stat->weight |= 1 << 1; + break; + case 'D': + case 'd': + stat->weight |= 1; + break; + default: + stat->weight |= 0; + } + } + buf += pg_mblen(buf); + } + } + + while (SPI_processed > 0) + { + for (i = 0; i < SPI_processed; i++) + { + Datum data = SPI_getbinval(SPI_tuptable->vals[i], SPI_tuptable->tupdesc, 1, &isnull); + + if (!isnull) + { + newstat = ts_accum(stat, data); + if (stat != newstat && stat) + pfree(stat); + stat = newstat; + } + } + + SPI_freetuptable(SPI_tuptable); + SPI_cursor_fetch(portal, true, 100); + } + + SPI_freetuptable(SPI_tuptable); + SPI_cursor_close(portal); + SPI_freeplan(plan); + pfree(query); + + return stat; +} + +Datum +ts_stat1(PG_FUNCTION_ARGS) +{ + FuncCallContext *funcctx; + Datum result; + + if (SRF_IS_FIRSTCALL()) + { + tsstat *stat; + text *txt = PG_GETARG_TEXT_P(0); + + funcctx = SRF_FIRSTCALL_INIT(); + SPI_connect(); + stat = ts_stat_sql(txt, NULL); + PG_FREE_IF_COPY(txt, 0); + ts_setup_firstcall(fcinfo, funcctx, stat); + SPI_finish(); + } + + funcctx = SRF_PERCALL_SETUP(); + if ((result = ts_process_call(funcctx)) != (Datum) 0) + SRF_RETURN_NEXT(funcctx, result); + SRF_RETURN_DONE(funcctx); +} + +Datum +ts_stat2(PG_FUNCTION_ARGS) +{ + FuncCallContext *funcctx; + Datum result; + + if (SRF_IS_FIRSTCALL()) + { + tsstat *stat; + text *txt = PG_GETARG_TEXT_P(0); + text *ws = PG_GETARG_TEXT_P(1); + + funcctx = SRF_FIRSTCALL_INIT(); + SPI_connect(); + stat = ts_stat_sql(txt, ws); + PG_FREE_IF_COPY(txt, 0); + PG_FREE_IF_COPY(ws, 1); + ts_setup_firstcall(fcinfo, funcctx, stat); + SPI_finish(); + } + + funcctx = SRF_PERCALL_SETUP(); + if ((result = ts_process_call(funcctx)) != (Datum) 0) + SRF_RETURN_NEXT(funcctx, result); + SRF_RETURN_DONE(funcctx); +} + + +/* Check if datatype is TEXT or binary-equivalent to it */ +static bool +istexttype(Oid typid) +{ + /* varchar(n) and char(n) are binary-compatible with text */ + if (typid==TEXTOID || typid==VARCHAROID || typid==BPCHAROID) + return true; + /* Allow domains over these types, too */ + typid = getBaseType(typid); + if (typid==TEXTOID || typid==VARCHAROID || typid==BPCHAROID) + return true; + return false; +} + + +/* + * Triggers for automatic update of a tsvector column from text column(s) + * + * Trigger arguments are either + * name of tsvector col, name of tsconfig to use, name(s) of text col(s) + * name of tsvector col, name of regconfig col, name(s) of text col(s) + * ie, tsconfig can either be specified by name, or indirectly as the + * contents of a regconfig field in the row. If the name is used, it must + * be explicitly schema-qualified. + */ +Datum +tsvector_update_trigger_byid(PG_FUNCTION_ARGS) +{ + return tsvector_update_trigger(fcinfo, false); +} + +Datum +tsvector_update_trigger_bycolumn(PG_FUNCTION_ARGS) +{ + return tsvector_update_trigger(fcinfo, true); +} + +static Datum +tsvector_update_trigger(PG_FUNCTION_ARGS, bool config_column) +{ + TriggerData *trigdata; + Trigger *trigger; + Relation rel; + HeapTuple rettuple = NULL; + int tsvector_attr_num, + i; + ParsedText prs; + Datum datum; + bool isnull; + text *txt; + Oid cfgId; + + /* Check call context */ + if (!CALLED_AS_TRIGGER(fcinfo)) /* internal error */ + elog(ERROR, "tsvector_update_trigger: not fired by trigger manager"); + + trigdata = (TriggerData *) fcinfo->context; + if (TRIGGER_FIRED_FOR_STATEMENT(trigdata->tg_event)) + elog(ERROR, "tsvector_update_trigger: can't process STATEMENT events"); + if (TRIGGER_FIRED_AFTER(trigdata->tg_event)) + elog(ERROR, "tsvector_update_trigger: must be fired BEFORE event"); + + if (TRIGGER_FIRED_BY_INSERT(trigdata->tg_event)) + rettuple = trigdata->tg_trigtuple; + else if (TRIGGER_FIRED_BY_UPDATE(trigdata->tg_event)) + rettuple = trigdata->tg_newtuple; + else + elog(ERROR, "tsvector_update_trigger: must be fired for INSERT or UPDATE"); + + trigger = trigdata->tg_trigger; + rel = trigdata->tg_relation; + + if (trigger->tgnargs < 3) + elog(ERROR, "tsvector_update_trigger: arguments must be tsvector_field, ts_config, text_field1, ...)"); + + /* Find the target tsvector column */ + tsvector_attr_num = SPI_fnumber(rel->rd_att, trigger->tgargs[0]); + if (tsvector_attr_num == SPI_ERROR_NOATTRIBUTE) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_COLUMN), + errmsg("tsvector column \"%s\" does not exist", + trigger->tgargs[0]))); + if (SPI_gettypeid(rel->rd_att, tsvector_attr_num) != TSVECTOROID) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("column \"%s\" is not of tsvector type", + trigger->tgargs[0]))); + + /* Find the configuration to use */ + if (config_column) + { + int config_attr_num; + + config_attr_num = SPI_fnumber(rel->rd_att, trigger->tgargs[1]); + if (config_attr_num == SPI_ERROR_NOATTRIBUTE) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_COLUMN), + errmsg("config column \"%s\" does not exist", + trigger->tgargs[1]))); + if (SPI_gettypeid(rel->rd_att, config_attr_num) != REGCONFIGOID) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("column \"%s\" is not of regconfig type", + trigger->tgargs[1]))); + + datum = SPI_getbinval(rettuple, rel->rd_att, config_attr_num, &isnull); + if (isnull) + ereport(ERROR, + (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), + errmsg("config column \"%s\" must not be NULL", + trigger->tgargs[1]))); + cfgId = DatumGetObjectId(datum); + } + else + { + List *names; + + names = stringToQualifiedNameList(trigger->tgargs[1]); + /* require a schema so that results are not search path dependent */ + if (list_length(names) < 2) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("text search configuration name \"%s\" must be schema-qualified", + trigger->tgargs[1]))); + cfgId = TSConfigGetCfgid(names, false); + } + + /* initialize parse state */ + prs.lenwords = 32; + prs.curwords = 0; + prs.pos = 0; + prs.words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs.lenwords); + + /* find all words in indexable column(s) */ + for (i = 2; i < trigger->tgnargs; i++) + { + int numattr; + + numattr = SPI_fnumber(rel->rd_att, trigger->tgargs[i]); + if (numattr == SPI_ERROR_NOATTRIBUTE) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_COLUMN), + errmsg("column \"%s\" does not exist", + trigger->tgargs[i]))); + if (!istexttype(SPI_gettypeid(rel->rd_att, numattr))) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("column \"%s\" is not of character type", + trigger->tgargs[i]))); + + datum = SPI_getbinval(rettuple, rel->rd_att, numattr, &isnull); + if (isnull) + continue; + + txt = DatumGetTextP(datum); + + parsetext(cfgId, &prs, VARDATA(txt), VARSIZE(txt) - VARHDRSZ); + + if (txt != (text *) DatumGetPointer(datum)) + pfree(txt); + } + + /* make tsvector value */ + if (prs.curwords) + { + datum = PointerGetDatum(make_tsvector(&prs)); + rettuple = SPI_modifytuple(rel, rettuple, 1, &tsvector_attr_num, + &datum, NULL); + pfree(DatumGetPointer(datum)); + } + else + { + TSVector out = palloc(CALCDATASIZE(0, 0)); + + SET_VARSIZE(out, CALCDATASIZE(0, 0)); + out->size = 0; + datum = PointerGetDatum(out); + rettuple = SPI_modifytuple(rel, rettuple, 1, &tsvector_attr_num, + &datum, NULL); + pfree(prs.words); + } + + if (rettuple == NULL) /* internal error */ + elog(ERROR, "tsvector_update_trigger: %d returned by SPI_modifytuple", + SPI_result); + + return PointerGetDatum(rettuple); +} |