diff options
author | Tom Lane <tgl@sss.pgh.pa.us> | 2011-12-17 16:41:16 -0500 |
---|---|---|
committer | Tom Lane <tgl@sss.pgh.pa.us> | 2011-12-17 16:42:30 -0500 |
commit | 8daeb5ddd698f661eb118f8e874e7c68cfd6ae09 (patch) | |
tree | 765599b73e45a6ca5529398489f31a534ab1924e /src/backend/access/spgist/spgscan.c | |
parent | 19fc0fe3ae7861a8b0d3ab8b67bd01fde33bf2da (diff) | |
download | postgresql-8daeb5ddd698f661eb118f8e874e7c68cfd6ae09.tar.gz postgresql-8daeb5ddd698f661eb118f8e874e7c68cfd6ae09.zip |
Add SP-GiST (space-partitioned GiST) index access method.
SP-GiST is comparable to GiST in flexibility, but supports non-balanced
partitioned search structures rather than balanced trees. As described at
PGCon 2011, this new indexing structure can beat GiST in both index build
time and query speed for search problems that it is well matched to.
There are a number of areas that could still use improvement, but at this
point the code seems committable.
Teodor Sigaev and Oleg Bartunov, with considerable revisions by Tom Lane
Diffstat (limited to 'src/backend/access/spgist/spgscan.c')
-rw-r--r-- | src/backend/access/spgist/spgscan.c | 543 |
1 files changed, 543 insertions, 0 deletions
diff --git a/src/backend/access/spgist/spgscan.c b/src/backend/access/spgist/spgscan.c new file mode 100644 index 00000000000..1c6180b2d24 --- /dev/null +++ b/src/backend/access/spgist/spgscan.c @@ -0,0 +1,543 @@ +/*------------------------------------------------------------------------- + * + * spgscan.c + * routines for scanning SP-GiST indexes + * + * + * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/spgist/spgscan.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/relscan.h" +#include "access/spgist_private.h" +#include "miscadmin.h" +#include "storage/bufmgr.h" +#include "utils/datum.h" +#include "utils/memutils.h" + + +typedef struct ScanStackEntry +{ + Datum reconstructedValue; /* value reconstructed from parent */ + int level; /* level of items on this page */ + ItemPointerData ptr; /* block and offset to scan from */ +} ScanStackEntry; + + +/* Free a ScanStackEntry */ +static void +freeScanStackEntry(SpGistScanOpaque so, ScanStackEntry *stackEntry) +{ + if (!so->state.attType.attbyval && + DatumGetPointer(stackEntry->reconstructedValue) != NULL) + pfree(DatumGetPointer(stackEntry->reconstructedValue)); + pfree(stackEntry); +} + +/* Free the entire stack */ +static void +freeScanStack(SpGistScanOpaque so) +{ + ListCell *lc; + + foreach(lc, so->scanStack) + { + freeScanStackEntry(so, (ScanStackEntry *) lfirst(lc)); + } + list_free(so->scanStack); + so->scanStack = NIL; +} + +/* Initialize scanStack with a single entry for the root page */ +static void +resetSpGistScanOpaque(SpGistScanOpaque so) +{ + ScanStackEntry *startEntry = palloc0(sizeof(ScanStackEntry)); + + ItemPointerSet(&startEntry->ptr, SPGIST_HEAD_BLKNO, FirstOffsetNumber); + + freeScanStack(so); + so->scanStack = list_make1(startEntry); + so->nPtrs = so->iPtr = 0; +} + +Datum +spgbeginscan(PG_FUNCTION_ARGS) +{ + Relation rel = (Relation) PG_GETARG_POINTER(0); + int keysz = PG_GETARG_INT32(1); + /* ScanKey scankey = (ScanKey) PG_GETARG_POINTER(2); */ + IndexScanDesc scan; + SpGistScanOpaque so; + + scan = RelationGetIndexScan(rel, keysz, 0); + + so = (SpGistScanOpaque) palloc0(sizeof(SpGistScanOpaqueData)); + initSpGistState(&so->state, scan->indexRelation); + so->tempCxt = AllocSetContextCreate(CurrentMemoryContext, + "SP-GiST search temporary context", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + resetSpGistScanOpaque(so); + scan->opaque = so; + + PG_RETURN_POINTER(scan); +} + +Datum +spgrescan(PG_FUNCTION_ARGS) +{ + IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0); + SpGistScanOpaque so = (SpGistScanOpaque) scan->opaque; + ScanKey scankey = (ScanKey) PG_GETARG_POINTER(1); + + if (scankey && scan->numberOfKeys > 0) + { + memmove(scan->keyData, scankey, + scan->numberOfKeys * sizeof(ScanKeyData)); + } + + resetSpGistScanOpaque(so); + + PG_RETURN_VOID(); +} + +Datum +spgendscan(PG_FUNCTION_ARGS) +{ + IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0); + SpGistScanOpaque so = (SpGistScanOpaque) scan->opaque; + + MemoryContextDelete(so->tempCxt); + + PG_RETURN_VOID(); +} + +Datum +spgmarkpos(PG_FUNCTION_ARGS) +{ + elog(ERROR, "SPGiST does not support mark/restore"); + PG_RETURN_VOID(); +} + +Datum +spgrestrpos(PG_FUNCTION_ARGS) +{ + elog(ERROR, "SPGiST does not support mark/restore"); + PG_RETURN_VOID(); +} + +/* + * Test whether a leaf datum satisfies all the scan keys + * + * *recheck is set true if any of the operators are lossy + */ +static bool +spgLeafTest(SpGistScanOpaque so, Datum leafDatum, + int level, Datum reconstructedValue, + bool *recheck) +{ + bool result = true; + spgLeafConsistentIn in; + spgLeafConsistentOut out; + MemoryContext oldCtx; + int i; + + *recheck = false; + + /* set up values that are the same for all quals */ + in.reconstructedValue = reconstructedValue; + in.level = level; + in.leafDatum = leafDatum; + + /* Apply each leaf consistent function, working in the temp context */ + oldCtx = MemoryContextSwitchTo(so->tempCxt); + for (i = 0; i < so->numberOfKeys; i++) + { + in.strategy = so->keyData[i].sk_strategy; + in.query = so->keyData[i].sk_argument; + + out.recheck = false; + + result = DatumGetBool(FunctionCall2Coll(&so->state.leafConsistentFn, + so->keyData[i].sk_collation, + PointerGetDatum(&in), + PointerGetDatum(&out))); + *recheck |= out.recheck; + if (!result) + break; + } + MemoryContextSwitchTo(oldCtx); + + return result; +} + +/* + * Walk the tree and report all tuples passing the scan quals to the storeRes + * subroutine. + * + * If scanWholeIndex is true, we'll do just that. If not, we'll stop at the + * next page boundary once we have reported at least one tuple. + */ +static void +spgWalk(Relation index, SpGistScanOpaque so, bool scanWholeIndex, + void (*storeRes) (SpGistScanOpaque, ItemPointer, bool)) +{ + Buffer buffer = InvalidBuffer; + bool reportedSome = false; + + while (scanWholeIndex || !reportedSome) + { + ScanStackEntry *stackEntry; + BlockNumber blkno; + OffsetNumber offset; + Page page; + + /* Pull next to-do item from the list */ + if (so->scanStack == NIL) + break; /* there are no more pages to scan */ + + stackEntry = (ScanStackEntry *) linitial(so->scanStack); + so->scanStack = list_delete_first(so->scanStack); + +redirect: + /* Check for interrupts, just in case of infinite loop */ + CHECK_FOR_INTERRUPTS(); + + blkno = ItemPointerGetBlockNumber(&stackEntry->ptr); + offset = ItemPointerGetOffsetNumber(&stackEntry->ptr); + + if (buffer == InvalidBuffer) + { + buffer = ReadBuffer(index, blkno); + LockBuffer(buffer, BUFFER_LOCK_SHARE); + } + else if (blkno != BufferGetBlockNumber(buffer)) + { + UnlockReleaseBuffer(buffer); + buffer = ReadBuffer(index, blkno); + LockBuffer(buffer, BUFFER_LOCK_SHARE); + } + /* else new pointer points to the same page, no work needed */ + + page = BufferGetPage(buffer); + + if (SpGistPageIsLeaf(page)) + { + SpGistLeafTuple leafTuple; + OffsetNumber max = PageGetMaxOffsetNumber(page); + bool recheck = false; + + if (blkno == SPGIST_HEAD_BLKNO) + { + /* When root is a leaf, examine all its tuples */ + for (offset = FirstOffsetNumber; offset <= max; offset++) + { + leafTuple = (SpGistLeafTuple) + PageGetItem(page, PageGetItemId(page, offset)); + if (leafTuple->tupstate != SPGIST_LIVE) + { + /* all tuples on root should be live */ + elog(ERROR, "unexpected SPGiST tuple state: %d", + leafTuple->tupstate); + } + + Assert(ItemPointerIsValid(&leafTuple->heapPtr)); + if (spgLeafTest(so, + SGLTDATUM(leafTuple, &so->state), + stackEntry->level, + stackEntry->reconstructedValue, + &recheck)) + { + storeRes(so, &leafTuple->heapPtr, recheck); + reportedSome = true; + } + } + } + else + { + /* Normal case: just examine the chain we arrived at */ + while (offset != InvalidOffsetNumber) + { + Assert(offset >= FirstOffsetNumber && offset <= max); + leafTuple = (SpGistLeafTuple) + PageGetItem(page, PageGetItemId(page, offset)); + if (leafTuple->tupstate != SPGIST_LIVE) + { + if (leafTuple->tupstate == SPGIST_REDIRECT) + { + /* redirection tuple should be first in chain */ + Assert(offset == ItemPointerGetOffsetNumber(&stackEntry->ptr)); + /* transfer attention to redirect point */ + stackEntry->ptr = ((SpGistDeadTuple) leafTuple)->pointer; + Assert(ItemPointerGetBlockNumber(&stackEntry->ptr) != SPGIST_METAPAGE_BLKNO); + goto redirect; + } + if (leafTuple->tupstate == SPGIST_DEAD) + { + /* dead tuple should be first in chain */ + Assert(offset == ItemPointerGetOffsetNumber(&stackEntry->ptr)); + /* No live entries on this page */ + Assert(leafTuple->nextOffset == InvalidOffsetNumber); + break; + } + /* We should not arrive at a placeholder */ + elog(ERROR, "unexpected SPGiST tuple state: %d", + leafTuple->tupstate); + } + + Assert(ItemPointerIsValid(&leafTuple->heapPtr)); + if (spgLeafTest(so, + SGLTDATUM(leafTuple, &so->state), + stackEntry->level, + stackEntry->reconstructedValue, + &recheck)) + { + storeRes(so, &leafTuple->heapPtr, recheck); + reportedSome = true; + } + + offset = leafTuple->nextOffset; + } + } + } + else /* page is inner */ + { + SpGistInnerTuple innerTuple; + SpGistNodeTuple node; + int i; + + innerTuple = (SpGistInnerTuple) PageGetItem(page, + PageGetItemId(page, offset)); + + if (innerTuple->tupstate != SPGIST_LIVE) + { + if (innerTuple->tupstate == SPGIST_REDIRECT) + { + /* transfer attention to redirect point */ + stackEntry->ptr = ((SpGistDeadTuple) innerTuple)->pointer; + Assert(ItemPointerGetBlockNumber(&stackEntry->ptr) != SPGIST_METAPAGE_BLKNO); + goto redirect; + } + elog(ERROR, "unexpected SPGiST tuple state: %d", + innerTuple->tupstate); + } + + if (so->numberOfKeys == 0) + { + /* + * This case cannot happen at the moment, because we don't + * set pg_am.amoptionalkey for SP-GiST. In order for full + * index scans to produce correct answers, we'd need to + * index nulls, which we don't. + */ + Assert(false); + +#ifdef NOT_USED + /* + * A full index scan could be done approximately like this, + * but note that reconstruction of indexed values would be + * impossible unless the API for inner_consistent is changed. + */ + SGITITERATE(innerTuple, i, node) + { + if (ItemPointerIsValid(&node->t_tid)) + { + ScanStackEntry *newEntry = palloc(sizeof(ScanStackEntry)); + + newEntry->ptr = node->t_tid; + newEntry->level = -1; + newEntry->reconstructedValue = (Datum) 0; + so->scanStack = lcons(newEntry, so->scanStack); + } + } +#endif + } + else + { + spgInnerConsistentIn in; + spgInnerConsistentOut out; + SpGistNodeTuple *nodes; + int *andMap; + int *levelAdds; + Datum *reconstructedValues; + int j, + nMatches = 0; + MemoryContext oldCtx; + + /* use temp context for calling inner_consistent */ + oldCtx = MemoryContextSwitchTo(so->tempCxt); + + /* set up values that are the same for all scankeys */ + in.reconstructedValue = stackEntry->reconstructedValue; + in.level = stackEntry->level; + in.allTheSame = innerTuple->allTheSame; + in.hasPrefix = (innerTuple->prefixSize > 0); + in.prefixDatum = SGITDATUM(innerTuple, &so->state); + in.nNodes = innerTuple->nNodes; + in.nodeLabels = spgExtractNodeLabels(&so->state, innerTuple); + + /* collect node pointers */ + nodes = (SpGistNodeTuple *) palloc(sizeof(SpGistNodeTuple) * in.nNodes); + SGITITERATE(innerTuple, i, node) + { + nodes[i] = node; + } + + andMap = (int *) palloc0(sizeof(int) * in.nNodes); + levelAdds = (int *) palloc0(sizeof(int) * in.nNodes); + reconstructedValues = (Datum *) palloc0(sizeof(Datum) * in.nNodes); + + for (j = 0; j < so->numberOfKeys; j++) + { + in.strategy = so->keyData[j].sk_strategy; + in.query = so->keyData[j].sk_argument; + + memset(&out, 0, sizeof(out)); + + FunctionCall2Coll(&so->state.innerConsistentFn, + so->keyData[j].sk_collation, + PointerGetDatum(&in), + PointerGetDatum(&out)); + + /* If allTheSame, they should all or none of 'em match */ + if (innerTuple->allTheSame) + if (out.nNodes != 0 && out.nNodes != in.nNodes) + elog(ERROR, "inconsistent inner_consistent results for allTheSame inner tuple"); + + nMatches = 0; + for (i = 0; i < out.nNodes; i++) + { + int nodeN = out.nodeNumbers[i]; + + andMap[nodeN]++; + if (andMap[nodeN] == j + 1) + nMatches++; + if (out.levelAdds) + levelAdds[nodeN] = out.levelAdds[i]; + if (out.reconstructedValues) + reconstructedValues[nodeN] = out.reconstructedValues[i]; + } + + /* quit as soon as all nodes have failed some qual */ + if (nMatches == 0) + break; + } + + MemoryContextSwitchTo(oldCtx); + + if (nMatches > 0) + { + for (i = 0; i < in.nNodes; i++) + { + if (andMap[i] == so->numberOfKeys && + ItemPointerIsValid(&nodes[i]->t_tid)) + { + ScanStackEntry *newEntry; + + /* Create new work item for this node */ + newEntry = palloc(sizeof(ScanStackEntry)); + newEntry->ptr = nodes[i]->t_tid; + newEntry->level = stackEntry->level + levelAdds[i]; + /* Must copy value out of temp context */ + newEntry->reconstructedValue = + datumCopy(reconstructedValues[i], + so->state.attType.attbyval, + so->state.attType.attlen); + + so->scanStack = lcons(newEntry, so->scanStack); + } + } + } + } + } + + /* done with this scan stack entry */ + freeScanStackEntry(so, stackEntry); + /* clear temp context before proceeding to the next one */ + MemoryContextReset(so->tempCxt); + } + + if (buffer != InvalidBuffer) + UnlockReleaseBuffer(buffer); +} + +/* storeRes subroutine for getbitmap case */ +static void +storeBitmap(SpGistScanOpaque so, ItemPointer heapPtr, bool recheck) +{ + tbm_add_tuples(so->tbm, heapPtr, 1, recheck); + so->ntids++; +} + +Datum +spggetbitmap(PG_FUNCTION_ARGS) +{ + IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0); + TIDBitmap *tbm = (TIDBitmap *) PG_GETARG_POINTER(1); + SpGistScanOpaque so = (SpGistScanOpaque) scan->opaque; + + /* Copy scankey to *so so we don't need to pass it around separately */ + so->numberOfKeys = scan->numberOfKeys; + so->keyData = scan->keyData; + + so->tbm = tbm; + so->ntids = 0; + + spgWalk(scan->indexRelation, so, true, storeBitmap); + + PG_RETURN_INT64(so->ntids); +} + +/* storeRes subroutine for gettuple case */ +static void +storeGettuple(SpGistScanOpaque so, ItemPointer heapPtr, bool recheck) +{ + Assert(so->nPtrs < MaxIndexTuplesPerPage); + so->heapPtrs[so->nPtrs] = *heapPtr; + so->recheck[so->nPtrs] = recheck; + so->nPtrs++; +} + +Datum +spggettuple(PG_FUNCTION_ARGS) +{ + IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0); + ScanDirection dir = (ScanDirection) PG_GETARG_INT32(1); + SpGistScanOpaque so = (SpGistScanOpaque) scan->opaque; + + if (dir != ForwardScanDirection) + elog(ERROR, "SP-GiST only supports forward scan direction"); + + /* Copy scankey to *so so we don't need to pass it around separately */ + so->numberOfKeys = scan->numberOfKeys; + so->keyData = scan->keyData; + + for (;;) + { + if (so->iPtr < so->nPtrs) + { + /* continuing to return tuples from a leaf page */ + scan->xs_ctup.t_self = so->heapPtrs[so->iPtr]; + scan->xs_recheck = so->recheck[so->iPtr]; + so->iPtr++; + PG_RETURN_BOOL(true); + } + + so->iPtr = so->nPtrs = 0; + spgWalk(scan->indexRelation, so, false, storeGettuple); + + if (so->nPtrs == 0) + break; /* must have completed scan */ + } + + PG_RETURN_BOOL(false); +} |