diff options
author | Tom Lane <tgl@sss.pgh.pa.us> | 2011-10-07 20:13:02 -0400 |
---|---|---|
committer | Tom Lane <tgl@sss.pgh.pa.us> | 2011-10-07 20:14:13 -0400 |
commit | a2822fb9337a21f98ac4ce850bb4145acf47ca27 (patch) | |
tree | c239fe9a32ff0225e906711a76348cee1567f0d8 /src/backend/executor/nodeIndexscan.c | |
parent | caa1054df8408b165e5f66ff25c87b6dd0a0a1e7 (diff) | |
download | postgresql-a2822fb9337a21f98ac4ce850bb4145acf47ca27.tar.gz postgresql-a2822fb9337a21f98ac4ce850bb4145acf47ca27.zip |
Support index-only scans using the visibility map to avoid heap fetches.
When a btree index contains all columns required by the query, and the
visibility map shows that all tuples on a target heap page are
visible-to-all, we don't need to fetch that heap page. This patch depends
on the previous patches that made the visibility map reliable.
There's a fair amount left to do here, notably trying to figure out a less
chintzy way of estimating the cost of an index-only scan, but the core
functionality seems ready to commit.
Robert Haas and Ibrar Ahmed, with some previous work by Heikki Linnakangas.
Diffstat (limited to 'src/backend/executor/nodeIndexscan.c')
-rw-r--r-- | src/backend/executor/nodeIndexscan.c | 123 |
1 files changed, 114 insertions, 9 deletions
diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c index da25384e860..32ed65797ae 100644 --- a/src/backend/executor/nodeIndexscan.c +++ b/src/backend/executor/nodeIndexscan.c @@ -26,6 +26,7 @@ #include "access/nbtree.h" #include "access/relscan.h" +#include "access/visibilitymap.h" #include "executor/execdebug.h" #include "executor/nodeIndexscan.h" #include "optimizer/clauses.h" @@ -36,6 +37,7 @@ static TupleTableSlot *IndexNext(IndexScanState *node); +static void IndexStoreHeapTuple(TupleTableSlot *slot, IndexScanDesc scandesc); /* ---------------------------------------------------------------- @@ -54,6 +56,7 @@ IndexNext(IndexScanState *node) IndexScanDesc scandesc; HeapTuple tuple; TupleTableSlot *slot; + ItemPointer tid; /* * extract necessary information from index scan node @@ -73,19 +76,63 @@ IndexNext(IndexScanState *node) slot = node->ss.ss_ScanTupleSlot; /* - * ok, now that we have what we need, fetch the next tuple. + * OK, now that we have what we need, fetch the next TID. */ - while ((tuple = index_getnext(scandesc, direction)) != NULL) + while ((tid = index_getnext_tid(scandesc, direction)) != NULL) { /* - * Store the scanned tuple in the scan tuple slot of the scan state. - * Note: we pass 'false' because tuples returned by amgetnext are - * pointers onto disk pages and must not be pfree()'d. + * Attempt index-only scan, if possible. For this, we need to have + * gotten an index tuple from the AM, and we need the TID to reference + * a heap page on which all tuples are known visible to everybody. + * If that's the case, we don't need to visit the heap page for tuple + * visibility testing, and we don't need any column values that are + * not available from the index. + * + * Note: in the index-only path, we are still holding pin on the + * scan's xs_cbuf, ie, the previously visited heap page. It's not + * clear whether it'd be better to release that pin. */ - ExecStoreTuple(tuple, /* tuple to store */ - slot, /* slot to store in */ - scandesc->xs_cbuf, /* buffer containing tuple */ - false); /* don't pfree */ + if (scandesc->xs_itup != NULL && + visibilitymap_test(scandesc->heapRelation, + ItemPointerGetBlockNumber(tid), + &node->iss_VMBuffer)) + { + /* + * Convert index tuple to look like a heap tuple, and store the + * results in the scan tuple slot. + */ + IndexStoreHeapTuple(slot, scandesc); + } + else + { + /* Index-only approach not possible, so fetch heap tuple. */ + tuple = index_fetch_heap(scandesc); + + /* Tuple might not be visible. */ + if (tuple == NULL) + continue; + + /* + * Only MVCC snapshots are supported here, so there should be no + * need to keep following the HOT chain once a visible entry has + * been found. If we did want to allow that, we'd need to keep + * more state to remember not to call index_getnext_tid next time. + */ + if (scandesc->xs_continue_hot) + elog(ERROR, "unsupported use of non-MVCC snapshot in executor"); + + /* + * Store the scanned tuple in the scan tuple slot of the scan + * state. + * + * Note: we pass 'false' because tuples returned by amgetnext are + * pointers onto disk pages and must not be pfree()'d. + */ + ExecStoreTuple(tuple, /* tuple to store */ + slot, /* slot to store in */ + scandesc->xs_cbuf, /* buffer containing tuple */ + false); /* don't pfree */ + } /* * If the index was lossy, we have to recheck the index quals using @@ -114,6 +161,53 @@ IndexNext(IndexScanState *node) } /* + * IndexStoreHeapTuple + * + * When performing an index-only scan, we build a faux heap tuple + * from the index tuple. Columns not present in the index are set to + * NULL, which is OK because we know they won't be referenced. + * + * The faux tuple is built as a virtual tuple that depends on the + * scandesc's xs_itup, so that must remain valid for as long as we + * need the slot contents. + */ +static void +IndexStoreHeapTuple(TupleTableSlot *slot, IndexScanDesc scandesc) +{ + Form_pg_index indexForm = scandesc->indexRelation->rd_index; + TupleDesc indexDesc = RelationGetDescr(scandesc->indexRelation); + int nindexatts = indexDesc->natts; + int nheapatts = slot->tts_tupleDescriptor->natts; + Datum *values = slot->tts_values; + bool *isnull = slot->tts_isnull; + int i; + + /* We must first set the slot to empty, and mark all columns as null */ + ExecClearTuple(slot); + + memset(isnull, true, nheapatts * sizeof(bool)); + + /* Transpose index tuple into heap tuple. */ + for (i = 0; i < nindexatts; i++) + { + int indexatt = indexForm->indkey.values[i]; + + /* Ignore expression columns, as well as system attributes */ + if (indexatt <= 0) + continue; + + Assert(indexatt <= nheapatts); + + values[indexatt - 1] = index_getattr(scandesc->xs_itup, i + 1, + indexDesc, + &isnull[indexatt - 1]); + } + + /* And now we can mark the slot as holding a virtual tuple. */ + ExecStoreVirtualTuple(slot); +} + +/* * IndexRecheck -- access method routine to recheck a tuple in EvalPlanQual */ static bool @@ -399,6 +493,13 @@ ExecEndIndexScan(IndexScanState *node) indexScanDesc = node->iss_ScanDesc; relation = node->ss.ss_currentRelation; + /* Release VM buffer pin, if any. */ + if (node->iss_VMBuffer != InvalidBuffer) + { + ReleaseBuffer(node->iss_VMBuffer); + node->iss_VMBuffer = InvalidBuffer; + } + /* * Free the exprcontext(s) ... now dead code, see ExecFreeExprContext */ @@ -611,6 +712,10 @@ ExecInitIndexScan(IndexScan *node, EState *estate, int eflags) indexstate->iss_NumScanKeys, indexstate->iss_NumOrderByKeys); + /* Prepare for possible index-only scan */ + indexstate->iss_ScanDesc->xs_want_itup = node->indexonly; + indexstate->iss_VMBuffer = InvalidBuffer; + /* * If no run-time keys to calculate, go ahead and pass the scankeys to the * index AM. |