diff options
Diffstat (limited to 'src/backend/access/spgist/spgtextproc.c')
-rw-r--r-- | src/backend/access/spgist/spgtextproc.c | 594 |
1 files changed, 594 insertions, 0 deletions
diff --git a/src/backend/access/spgist/spgtextproc.c b/src/backend/access/spgist/spgtextproc.c new file mode 100644 index 00000000000..b6037978425 --- /dev/null +++ b/src/backend/access/spgist/spgtextproc.c @@ -0,0 +1,594 @@ +/*------------------------------------------------------------------------- + * + * spgtextproc.c + * implementation of compressed-suffix tree over text + * + * + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/spgist/spgtextproc.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/spgist.h" +#include "catalog/pg_type.h" +#include "mb/pg_wchar.h" +#include "utils/builtins.h" +#include "utils/datum.h" +#include "utils/pg_locale.h" + + +/* + * In the worst case, a inner tuple in a text suffix tree could have as many + * as 256 nodes (one for each possible byte value). Each node can take 16 + * bytes on MAXALIGN=8 machines. The inner tuple must fit on an index page + * of size BLCKSZ. Rather than assuming we know the exact amount of overhead + * imposed by page headers, tuple headers, etc, we leave 100 bytes for that + * (the actual overhead should be no more than 56 bytes at this writing, so + * there is slop in this number). The upshot is that the maximum safe prefix + * length is this: + */ +#define SPGIST_MAX_PREFIX_LENGTH (BLCKSZ - 256 * 16 - 100) + +/* Struct for sorting values in picksplit */ +typedef struct spgNodePtr +{ + Datum d; + int i; + uint8 c; +} spgNodePtr; + + +Datum +spg_text_config(PG_FUNCTION_ARGS) +{ + /* spgConfigIn *cfgin = (spgConfigIn *) PG_GETARG_POINTER(0); */ + spgConfigOut *cfg = (spgConfigOut *) PG_GETARG_POINTER(1); + + cfg->prefixType = TEXTOID; + cfg->labelType = CHAROID; + cfg->longValuesOK = true; /* suffixing will shorten long values */ + PG_RETURN_VOID(); +} + +/* + * Form a text datum from the given not-necessarily-null-terminated string, + * using short varlena header format if possible + */ +static Datum +formTextDatum(const char *data, int datalen) +{ + char *p; + + p = (char *) palloc(datalen + VARHDRSZ); + + if (datalen + VARHDRSZ_SHORT <= VARATT_SHORT_MAX) + { + SET_VARSIZE_SHORT(p, datalen + VARHDRSZ_SHORT); + if (datalen) + memcpy(p + VARHDRSZ_SHORT, data, datalen); + } + else + { + SET_VARSIZE(p, datalen + VARHDRSZ); + memcpy(p + VARHDRSZ, data, datalen); + } + + return PointerGetDatum(p); +} + +/* + * Find the length of the common prefix of a and b + */ +static int +commonPrefix(const char *a, const char *b, int lena, int lenb) +{ + int i = 0; + + while (i < lena && i < lenb && *a == *b) + { + a++; + b++; + i++; + } + + return i; +} + +/* + * Binary search an array of uint8 datums for a match to c + * + * On success, *i gets the match location; on failure, it gets where to insert + */ +static bool +searchChar(Datum *nodeLabels, int nNodes, uint8 c, int *i) +{ + int StopLow = 0, + StopHigh = nNodes; + + while (StopLow < StopHigh) + { + int StopMiddle = (StopLow + StopHigh) >> 1; + uint8 middle = DatumGetUInt8(nodeLabels[StopMiddle]); + + if (c < middle) + StopHigh = StopMiddle; + else if (c > middle) + StopLow = StopMiddle + 1; + else + { + *i = StopMiddle; + return true; + } + } + + *i = StopHigh; + return false; +} + +Datum +spg_text_choose(PG_FUNCTION_ARGS) +{ + spgChooseIn *in = (spgChooseIn *) PG_GETARG_POINTER(0); + spgChooseOut *out = (spgChooseOut *) PG_GETARG_POINTER(1); + text *inText = DatumGetTextPP(in->datum); + char *inStr = VARDATA_ANY(inText); + int inSize = VARSIZE_ANY_EXHDR(inText); + uint8 nodeChar = '\0'; + int i = 0; + int commonLen = 0; + + /* Check for prefix match, set nodeChar to first byte after prefix */ + if (in->hasPrefix) + { + text *prefixText = DatumGetTextPP(in->prefixDatum); + char *prefixStr = VARDATA_ANY(prefixText); + int prefixSize = VARSIZE_ANY_EXHDR(prefixText); + + commonLen = commonPrefix(inStr + in->level, + prefixStr, + inSize - in->level, + prefixSize); + + if (commonLen == prefixSize) + { + if (inSize - in->level > commonLen) + nodeChar = *(uint8 *) (inStr + in->level + commonLen); + else + nodeChar = '\0'; + } + else + { + /* Must split tuple because incoming value doesn't match prefix */ + out->resultType = spgSplitTuple; + + if (commonLen == 0) + { + out->result.splitTuple.prefixHasPrefix = false; + } + else + { + out->result.splitTuple.prefixHasPrefix = true; + out->result.splitTuple.prefixPrefixDatum = + formTextDatum(prefixStr, commonLen); + } + out->result.splitTuple.nodeLabel = + UInt8GetDatum(*(prefixStr + commonLen)); + + if (prefixSize - commonLen == 1) + { + out->result.splitTuple.postfixHasPrefix = false; + } + else + { + out->result.splitTuple.postfixHasPrefix = true; + out->result.splitTuple.postfixPrefixDatum = + formTextDatum(prefixStr + commonLen + 1, + prefixSize - commonLen - 1); + } + + PG_RETURN_VOID(); + } + } + else if (inSize > in->level) + { + nodeChar = *(uint8 *) (inStr + in->level); + } + else + { + nodeChar = '\0'; + } + + /* Look up nodeChar in the node label array */ + if (searchChar(in->nodeLabels, in->nNodes, nodeChar, &i)) + { + /* + * Descend to existing node. (If in->allTheSame, the core code will + * ignore our nodeN specification here, but that's OK. We still + * have to provide the correct levelAdd and restDatum values, and + * those are the same regardless of which node gets chosen by core.) + */ + out->resultType = spgMatchNode; + out->result.matchNode.nodeN = i; + out->result.matchNode.levelAdd = commonLen + 1; + if (inSize - in->level - commonLen - 1 > 0) + out->result.matchNode.restDatum = + formTextDatum(inStr + in->level + commonLen + 1, + inSize - in->level - commonLen - 1); + else + out->result.matchNode.restDatum = + formTextDatum(NULL, 0); + } + else if (in->allTheSame) + { + /* + * Can't use AddNode action, so split the tuple. The upper tuple + * has the same prefix as before and uses an empty node label for + * the lower tuple. The lower tuple has no prefix and the same + * node labels as the original tuple. + */ + out->resultType = spgSplitTuple; + out->result.splitTuple.prefixHasPrefix = in->hasPrefix; + out->result.splitTuple.prefixPrefixDatum = in->prefixDatum; + out->result.splitTuple.nodeLabel = UInt8GetDatum('\0'); + out->result.splitTuple.postfixHasPrefix = false; + } + else + { + /* Add a node for the not-previously-seen nodeChar value */ + out->resultType = spgAddNode; + out->result.addNode.nodeLabel = UInt8GetDatum(nodeChar); + out->result.addNode.nodeN = i; + } + + PG_RETURN_VOID(); +} + +/* qsort comparator to sort spgNodePtr structs by "c" */ +static int +cmpNodePtr(const void *a, const void *b) +{ + const spgNodePtr *aa = (const spgNodePtr *) a; + const spgNodePtr *bb = (const spgNodePtr *) b; + + if (aa->c == bb->c) + return 0; + else if (aa->c > bb->c) + return 1; + else + return -1; +} + +Datum +spg_text_picksplit(PG_FUNCTION_ARGS) +{ + spgPickSplitIn *in = (spgPickSplitIn *) PG_GETARG_POINTER(0); + spgPickSplitOut *out = (spgPickSplitOut *) PG_GETARG_POINTER(1); + text *text0 = DatumGetTextPP(in->datums[0]); + int i, + commonLen; + spgNodePtr *nodes; + + /* Identify longest common prefix, if any */ + commonLen = VARSIZE_ANY_EXHDR(text0); + for (i = 1; i < in->nTuples && commonLen > 0; i++) + { + text *texti = DatumGetTextPP(in->datums[i]); + int tmp = commonPrefix(VARDATA_ANY(text0), + VARDATA_ANY(texti), + VARSIZE_ANY_EXHDR(text0), + VARSIZE_ANY_EXHDR(texti)); + + if (tmp < commonLen) + commonLen = tmp; + } + + /* + * Limit the prefix length, if necessary, to ensure that the resulting + * inner tuple will fit on a page. + */ + commonLen = Min(commonLen, SPGIST_MAX_PREFIX_LENGTH); + + /* Set node prefix to be that string, if it's not empty */ + if (commonLen == 0) + { + out->hasPrefix = false; + } + else + { + out->hasPrefix = true; + out->prefixDatum = formTextDatum(VARDATA_ANY(text0), commonLen); + } + + /* Extract the node label (first non-common byte) from each value */ + nodes = (spgNodePtr *) palloc(sizeof(spgNodePtr) * in->nTuples); + + for (i = 0; i < in->nTuples; i++) + { + text *texti = DatumGetTextPP(in->datums[i]); + + if (commonLen < VARSIZE_ANY_EXHDR(texti)) + nodes[i].c = *(uint8 *) (VARDATA_ANY(texti) + commonLen); + else + nodes[i].c = '\0'; /* use \0 if string is all common */ + nodes[i].i = i; + nodes[i].d = in->datums[i]; + } + + /* + * Sort by label bytes so that we can group the values into nodes. This + * also ensures that the nodes are ordered by label value, allowing the + * use of binary search in searchChar. + */ + qsort(nodes, in->nTuples, sizeof(*nodes), cmpNodePtr); + + /* And emit results */ + out->nNodes = 0; + out->nodeLabels = (Datum *) palloc(sizeof(Datum) * in->nTuples); + out->mapTuplesToNodes = (int *) palloc(sizeof(int) * in->nTuples); + out->leafTupleDatums = (Datum *) palloc(sizeof(Datum) * in->nTuples); + + for (i = 0; i < in->nTuples; i++) + { + text *texti = DatumGetTextPP(nodes[i].d); + Datum leafD; + + if (i == 0 || nodes[i].c != nodes[i - 1].c) + { + out->nodeLabels[out->nNodes] = UInt8GetDatum(nodes[i].c); + out->nNodes++; + } + + if (commonLen < VARSIZE_ANY_EXHDR(texti)) + leafD = formTextDatum(VARDATA_ANY(texti) + commonLen + 1, + VARSIZE_ANY_EXHDR(texti) - commonLen - 1); + else + leafD = formTextDatum(NULL, 0); + + out->leafTupleDatums[nodes[i].i] = leafD; + out->mapTuplesToNodes[nodes[i].i] = out->nNodes - 1; + } + + PG_RETURN_VOID(); +} + +Datum +spg_text_inner_consistent(PG_FUNCTION_ARGS) +{ + spgInnerConsistentIn *in = (spgInnerConsistentIn *) PG_GETARG_POINTER(0); + spgInnerConsistentOut *out = (spgInnerConsistentOut *) PG_GETARG_POINTER(1); + StrategyNumber strategy = in->strategy; + text *inText; + int inSize; + int i; + text *reconstrText = NULL; + int maxReconstrLen = 0; + text *prefixText = NULL; + int prefixSize = 0; + + /* + * If it's a collation-aware operator, but the collation is C, we can + * treat it as non-collation-aware. + */ + if (strategy > 10 && + lc_collate_is_c(PG_GET_COLLATION())) + strategy -= 10; + + inText = DatumGetTextPP(in->query); + inSize = VARSIZE_ANY_EXHDR(inText); + + /* + * Reconstruct values represented at this tuple, including parent data, + * prefix of this tuple if any, and the node label if any. in->level + * should be the length of the previously reconstructed value, and the + * number of bytes added here is prefixSize or prefixSize + 1. + * + * Note: we assume that in->reconstructedValue isn't toasted and doesn't + * have a short varlena header. This is okay because it must have been + * created by a previous invocation of this routine, and we always emit + * long-format reconstructed values. + */ + Assert(in->level == 0 ? DatumGetPointer(in->reconstructedValue) == NULL : + VARSIZE_ANY_EXHDR(DatumGetPointer(in->reconstructedValue)) == in->level); + + maxReconstrLen = in->level + 1; + if (in->hasPrefix) + { + prefixText = DatumGetTextPP(in->prefixDatum); + prefixSize = VARSIZE_ANY_EXHDR(prefixText); + maxReconstrLen += prefixSize; + } + + reconstrText = palloc(VARHDRSZ + maxReconstrLen); + SET_VARSIZE(reconstrText, VARHDRSZ + maxReconstrLen); + + if (in->level) + memcpy(VARDATA(reconstrText), + VARDATA(DatumGetPointer(in->reconstructedValue)), + in->level); + if (prefixSize) + memcpy(((char *) VARDATA(reconstrText)) + in->level, + VARDATA_ANY(prefixText), + prefixSize); + /* last byte of reconstrText will be filled in below */ + + /* + * Scan the child nodes. For each one, complete the reconstructed value + * and see if it's consistent with the query. If so, emit an entry into + * the output arrays. + */ + out->nodeNumbers = (int *) palloc(sizeof(int) * in->nNodes); + out->levelAdds = (int *) palloc(sizeof(int) * in->nNodes); + out->reconstructedValues = (Datum *) palloc(sizeof(Datum) * in->nNodes); + out->nNodes = 0; + + for (i = 0; i < in->nNodes; i++) + { + uint8 nodeChar = DatumGetUInt8(in->nodeLabels[i]); + int thisLen; + int r; + bool res = false; + + /* If nodeChar is zero, don't include it in data */ + if (nodeChar == '\0') + thisLen = maxReconstrLen - 1; + else + { + ((char *) VARDATA(reconstrText))[maxReconstrLen - 1] = nodeChar; + thisLen = maxReconstrLen; + } + + r = memcmp(VARDATA(reconstrText), VARDATA_ANY(inText), + Min(inSize, thisLen)); + + switch (strategy) + { + case BTLessStrategyNumber: + case BTLessEqualStrategyNumber: + if (r <= 0) + res = true; + break; + case BTEqualStrategyNumber: + if (r == 0 && inSize >= thisLen) + res = true; + break; + case BTGreaterEqualStrategyNumber: + case BTGreaterStrategyNumber: + if (r >= 0) + res = true; + break; + case BTLessStrategyNumber + 10: + case BTLessEqualStrategyNumber + 10: + case BTGreaterEqualStrategyNumber + 10: + case BTGreaterStrategyNumber + 10: + /* + * with non-C collation we need to traverse whole tree :-( + */ + res = true; + break; + default: + elog(ERROR, "unrecognized strategy number: %d", + in->strategy); + break; + } + + if (res) + { + out->nodeNumbers[out->nNodes] = i; + out->levelAdds[out->nNodes] = thisLen - in->level; + SET_VARSIZE(reconstrText, VARHDRSZ + thisLen); + out->reconstructedValues[out->nNodes] = + datumCopy(PointerGetDatum(reconstrText), false, -1); + out->nNodes++; + } + } + + PG_RETURN_VOID(); +} + +Datum +spg_text_leaf_consistent(PG_FUNCTION_ARGS) +{ + spgLeafConsistentIn *in = (spgLeafConsistentIn *) PG_GETARG_POINTER(0); + spgLeafConsistentOut *out = (spgLeafConsistentOut *) PG_GETARG_POINTER(1); + StrategyNumber strategy = in->strategy; + text *query = DatumGetTextPP(in->query); + int level = in->level; + text *leafValue, + *reconstrValue = NULL; + char *fullValue; + int fullLen; + int queryLen; + int r; + bool res; + + /* all tests are exact */ + out->recheck = false; + + leafValue = DatumGetTextPP(in->leafDatum); + + if (DatumGetPointer(in->reconstructedValue)) + reconstrValue = DatumGetTextP(in->reconstructedValue); + + Assert(level == 0 ? reconstrValue == NULL : + VARSIZE_ANY_EXHDR(reconstrValue) == level); + + fullLen = level + VARSIZE_ANY_EXHDR(leafValue); + + queryLen = VARSIZE_ANY_EXHDR(query); + + /* For equality, we needn't reconstruct fullValue if not same length */ + if (strategy == BTEqualStrategyNumber && queryLen != fullLen) + PG_RETURN_BOOL(false); + + /* Else, reconstruct the full string represented by this leaf tuple */ + if (VARSIZE_ANY_EXHDR(leafValue) == 0 && level > 0) + { + fullValue = VARDATA(reconstrValue); + } + else + { + fullValue = palloc(fullLen); + if (level) + memcpy(fullValue, VARDATA(reconstrValue), level); + if (VARSIZE_ANY_EXHDR(leafValue) > 0) + memcpy(fullValue + level, VARDATA_ANY(leafValue), + VARSIZE_ANY_EXHDR(leafValue)); + } + + /* Run the appropriate type of comparison */ + if (strategy > 10) + { + /* Collation-aware comparison */ + strategy -= 10; + + /* If asserts are enabled, verify encoding of reconstructed string */ + Assert(pg_verifymbstr(fullValue, fullLen, false)); + + r = varstr_cmp(fullValue, Min(queryLen, fullLen), + VARDATA_ANY(query), Min(queryLen, fullLen), + PG_GET_COLLATION()); + } + else + { + /* Non-collation-aware comparison */ + r = memcmp(fullValue, VARDATA_ANY(query), Min(queryLen, fullLen)); + } + + if (r == 0) + { + if (queryLen > fullLen) + r = -1; + else if (queryLen < fullLen) + r = 1; + } + + switch (strategy) + { + case BTLessStrategyNumber: + res = (r < 0); + break; + case BTLessEqualStrategyNumber: + res = (r <= 0); + break; + case BTEqualStrategyNumber: + res = (r == 0); + break; + case BTGreaterEqualStrategyNumber: + res = (r >= 0); + break; + case BTGreaterStrategyNumber: + res = (r > 0); + break; + default: + elog(ERROR, "unrecognized strategy number: %d", in->strategy); + res = false; + break; + } + + PG_RETURN_BOOL(res); +} |