diff options
author | Tom Lane <tgl@sss.pgh.pa.us> | 2011-10-11 14:20:06 -0400 |
---|---|---|
committer | Tom Lane <tgl@sss.pgh.pa.us> | 2011-10-11 14:21:30 -0400 |
commit | a0185461dd94c8d31d8d55a7f2839b0d2f172ab9 (patch) | |
tree | 3bd68d4e123336bbdefa8fd92372f0af7fb6d64f /src/backend/executor/nodeIndexonlyscan.c | |
parent | fa351d5a0db0672b6f586315720302e493116f27 (diff) | |
download | postgresql-a0185461dd94c8d31d8d55a7f2839b0d2f172ab9.tar.gz postgresql-a0185461dd94c8d31d8d55a7f2839b0d2f172ab9.zip |
Rearrange the implementation of index-only scans.
This commit changes index-only scans so that data is read directly from the
index tuple without first generating a faux heap tuple. The only immediate
benefit is that indexes on system columns (such as OID) can be used in
index-only scans, but this is necessary infrastructure if we are ever to
support index-only scans on expression indexes. The executor is now ready
for that, though the planner still needs substantial work to recognize
the possibility.
To do this, Vars in index-only plan nodes have to refer to index columns
not heap columns. I introduced a new special varno, INDEX_VAR, to mark
such Vars to avoid confusion. (In passing, this commit renames the two
existing special varnos to OUTER_VAR and INNER_VAR.) This allows
ruleutils.c to handle them with logic similar to what we use for subplan
reference Vars.
Since index-only scans are now fundamentally different from regular
indexscans so far as their expression subtrees are concerned, I also chose
to change them to have their own plan node type (and hence, their own
executor source file).
Diffstat (limited to 'src/backend/executor/nodeIndexonlyscan.c')
-rw-r--r-- | src/backend/executor/nodeIndexonlyscan.c | 542 |
1 file changed, 542 insertions, 0 deletions
/*-------------------------------------------------------------------------
 *
 * nodeIndexonlyscan.c
 *	  Routines to support index-only scans
 *
 * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/executor/nodeIndexonlyscan.c
 *
 *-------------------------------------------------------------------------
 */
/*
 * INTERFACE ROUTINES
 *		ExecIndexOnlyScan			scans an index
 *		IndexOnlyNext				retrieve next tuple
 *		ExecInitIndexOnlyScan		creates and initializes state info.
 *		ExecReScanIndexOnlyScan		rescans the indexed relation.
 *		ExecEndIndexOnlyScan		releases all storage.
 *		ExecIndexOnlyMarkPos		marks scan position.
 *		ExecIndexOnlyRestrPos		restores scan position.
 */
#include "postgres.h"

#include "access/relscan.h"
#include "access/visibilitymap.h"
#include "catalog/pg_opfamily.h"
#include "catalog/pg_type.h"
#include "executor/execdebug.h"
#include "executor/nodeIndexonlyscan.h"
#include "executor/nodeIndexscan.h"
#include "storage/bufmgr.h"
#include "utils/memutils.h"
#include "utils/rel.h"


static TupleTableSlot *IndexOnlyNext(IndexOnlyScanState *node);
static void StoreIndexTuple(TupleTableSlot *slot, IndexTuple itup,
				Relation indexRel);


/* ----------------------------------------------------------------
 *		IndexOnlyNext
 *
 *		Retrieve a tuple from the IndexOnlyScan node's index.
 * ----------------------------------------------------------------
 */
static TupleTableSlot *
IndexOnlyNext(IndexOnlyScanState *node)
{
	EState	   *estate;
	ExprContext *econtext;
	ScanDirection direction;
	IndexScanDesc scandesc;
	HeapTuple	tuple;
	TupleTableSlot *slot;
	ItemPointer tid;

	/*
	 * extract necessary information from index scan node
	 */
	estate = node->ss.ps.state;
	direction = estate->es_direction;
	/* flip direction if this is an overall backward scan */
	if (ScanDirectionIsBackward(((IndexOnlyScan *) node->ss.ps.plan)->indexorderdir))
	{
		if (ScanDirectionIsForward(direction))
			direction = BackwardScanDirection;
		else if (ScanDirectionIsBackward(direction))
			direction = ForwardScanDirection;
	}
	scandesc = node->ioss_ScanDesc;
	econtext = node->ss.ps.ps_ExprContext;
	slot = node->ss.ss_ScanTupleSlot;

	/*
	 * OK, now that we have what we need, fetch the next tuple.
	 */
	while ((tid = index_getnext_tid(scandesc, direction)) != NULL)
	{
		/*
		 * We can skip the heap fetch if the TID references a heap page on
		 * which all tuples are known visible to everybody.  In any case,
		 * we'll use the index tuple not the heap tuple as the data source.
		 */
		if (!visibilitymap_test(scandesc->heapRelation,
								ItemPointerGetBlockNumber(tid),
								&node->ioss_VMBuffer))
		{
			/*
			 * Rats, we have to visit the heap to check visibility.
			 */
			tuple = index_fetch_heap(scandesc);
			if (tuple == NULL)
				continue;		/* no visible tuple, try next index entry */

			/*
			 * Only MVCC snapshots are supported here, so there should be no
			 * need to keep following the HOT chain once a visible entry has
			 * been found.  If we did want to allow that, we'd need to keep
			 * more state to remember not to call index_getnext_tid next time.
			 */
			if (scandesc->xs_continue_hot)
				elog(ERROR, "non-MVCC snapshots are not supported in index-only scans");

			/*
			 * Note: at this point we are holding a pin on the heap page, as
			 * recorded in scandesc->xs_cbuf.  We could release that pin now,
			 * but it's not clear whether it's a win to do so.  The next index
			 * entry might require a visit to the same heap page.
			 */
		}

		/*
		 * Fill the scan tuple slot with data from the index.
		 */
		StoreIndexTuple(slot, scandesc->xs_itup, scandesc->indexRelation);

		/*
		 * If the index was lossy, we have to recheck the index quals.
		 * (Currently, this can never happen, but we should support the case
		 * for possible future use, eg with GiST indexes.)
		 */
		if (scandesc->xs_recheck)
		{
			econtext->ecxt_scantuple = slot;
			ResetExprContext(econtext);
			if (!ExecQual(node->indexqual, econtext, false))
			{
				/* Fails recheck, so drop it and loop back for another */
				InstrCountFiltered2(node, 1);
				continue;
			}
		}

		return slot;
	}

	/*
	 * if we get here it means the index scan failed so we are at the end of
	 * the scan..
	 */
	return ExecClearTuple(slot);
}

/*
 * StoreIndexTuple
 *		Fill the slot with data from the index tuple.
 *
 * At some point this might be generally-useful functionality, but
 * right now we don't need it elsewhere.
 */
static void
StoreIndexTuple(TupleTableSlot *slot, IndexTuple itup, Relation indexRel)
{
	TupleDesc	indexDesc = RelationGetDescr(indexRel);
	int			nindexatts = indexDesc->natts;
	Datum	   *values = slot->tts_values;
	bool	   *isnull = slot->tts_isnull;
	int			i;

	/*
	 * Note: we must use the index relation's tupdesc in index_getattr,
	 * not the slot's tupdesc, because of index_descriptor_hack().
	 */
	Assert(slot->tts_tupleDescriptor->natts == nindexatts);

	ExecClearTuple(slot);
	for (i = 0; i < nindexatts; i++)
		values[i] = index_getattr(itup, i + 1, indexDesc, &isnull[i]);
	ExecStoreVirtualTuple(slot);
}

/*
 * index_descriptor_hack -- ugly kluge to make index's tupdesc OK for slot
 *
 * This is necessary because, alone among btree opclasses, name_ops uses
 * a storage type (cstring) different from its input type.  The index
 * tuple descriptor will show "cstring", which is correct, but we have to
 * expose "name" as the slot datatype or ExecEvalVar will whine.  If we
 * ever want to have any other cases with a different storage type, we ought
 * to think of a cleaner solution than this.
 */
static TupleDesc
index_descriptor_hack(Relation indexRel)
{
	TupleDesc	tupdesc = RelationGetDescr(indexRel);
	int			i;

	/* copy so we can scribble on it safely */
	tupdesc = CreateTupleDescCopy(tupdesc);

	for (i = 0; i < tupdesc->natts; i++)
	{
		if (indexRel->rd_opfamily[i] == NAME_BTREE_FAM_OID &&
			tupdesc->attrs[i]->atttypid == CSTRINGOID)
		{
			tupdesc->attrs[i]->atttypid = NAMEOID;

			/*
			 * We set attlen to match the type OID just in case anything looks
			 * at it.  Note that this is safe only because StoreIndexTuple
			 * will insert the data as a virtual tuple, and we don't expect
			 * anything will try to materialize the scan tuple slot.
			 */
			tupdesc->attrs[i]->attlen = NAMEDATALEN;
		}
	}

	return tupdesc;
}

/*
 * IndexOnlyRecheck -- access method routine to recheck a tuple in EvalPlanQual
 *
 * This can't really happen, since an index can't supply CTID which would
 * be necessary data for any potential EvalPlanQual target relation.  If it
 * did happen, the EPQ code would pass us the wrong data, namely a heap
 * tuple not an index tuple.  So throw an error.
 */
static bool
IndexOnlyRecheck(IndexOnlyScanState *node, TupleTableSlot *slot)
{
	elog(ERROR, "EvalPlanQual recheck is not supported in index-only scans");
	return false;				/* keep compiler quiet */
}

/* ----------------------------------------------------------------
 *		ExecIndexOnlyScan(node)
 *
 *		Fetch the next tuple via the generic ExecScan machinery,
 *		using IndexOnlyNext as the access method.
 * ----------------------------------------------------------------
 */
TupleTableSlot *
ExecIndexOnlyScan(IndexOnlyScanState *node)
{
	/*
	 * If we have runtime keys and they've not already been set up, do it now.
	 */
	if (node->ioss_NumRuntimeKeys != 0 && !node->ioss_RuntimeKeysReady)
		ExecReScan((PlanState *) node);

	return ExecScan(&node->ss,
					(ExecScanAccessMtd) IndexOnlyNext,
					(ExecScanRecheckMtd) IndexOnlyRecheck);
}

/* ----------------------------------------------------------------
 *		ExecReScanIndexOnlyScan(node)
 *
 *		Recalculates the values of any scan keys whose value depends on
 *		information known at runtime, then rescans the indexed relation.
 *
 *		Updating the scan key was formerly done separately in
 *		ExecUpdateIndexScanKeys.  Integrating it into ReScan makes
 *		rescans of indices and relations/general streams more uniform.
 * ----------------------------------------------------------------
 */
void
ExecReScanIndexOnlyScan(IndexOnlyScanState *node)
{
	/*
	 * If we are doing runtime key calculations (ie, any of the index key
	 * values weren't simple Consts), compute the new key values.  But first,
	 * reset the context so we don't leak memory as each outer tuple is
	 * scanned.  Note this assumes that we will recalculate *all* runtime keys
	 * on each call.
	 */
	if (node->ioss_NumRuntimeKeys != 0)
	{
		ExprContext *econtext = node->ioss_RuntimeContext;

		ResetExprContext(econtext);
		ExecIndexEvalRuntimeKeys(econtext,
								 node->ioss_RuntimeKeys,
								 node->ioss_NumRuntimeKeys);
	}
	node->ioss_RuntimeKeysReady = true;

	/* reset index scan */
	index_rescan(node->ioss_ScanDesc,
				 node->ioss_ScanKeys, node->ioss_NumScanKeys,
				 node->ioss_OrderByKeys, node->ioss_NumOrderByKeys);

	ExecScanReScan(&node->ss);
}


/* ----------------------------------------------------------------
 *		ExecEndIndexOnlyScan
 *
 *		Releases the VM buffer pin, scan descriptor, index relation,
 *		and heap relation held by the node.
 * ----------------------------------------------------------------
 */
void
ExecEndIndexOnlyScan(IndexOnlyScanState *node)
{
	Relation	indexRelationDesc;
	IndexScanDesc indexScanDesc;
	Relation	relation;

	/*
	 * extract information from the node
	 */
	indexRelationDesc = node->ioss_RelationDesc;
	indexScanDesc = node->ioss_ScanDesc;
	relation = node->ss.ss_currentRelation;

	/* Release VM buffer pin, if any. */
	if (node->ioss_VMBuffer != InvalidBuffer)
	{
		ReleaseBuffer(node->ioss_VMBuffer);
		node->ioss_VMBuffer = InvalidBuffer;
	}

	/*
	 * Free the exprcontext(s) ... now dead code, see ExecFreeExprContext
	 */
#ifdef NOT_USED
	ExecFreeExprContext(&node->ss.ps);
	if (node->ioss_RuntimeContext)
		FreeExprContext(node->ioss_RuntimeContext, true);
#endif

	/*
	 * clear out tuple table slots
	 */
	ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
	ExecClearTuple(node->ss.ss_ScanTupleSlot);

	/*
	 * close the index relation (no-op if we didn't open it)
	 */
	if (indexScanDesc)
		index_endscan(indexScanDesc);
	if (indexRelationDesc)
		index_close(indexRelationDesc, NoLock);

	/*
	 * close the heap relation.
	 */
	ExecCloseScanRelation(relation);
}

/* ----------------------------------------------------------------
 *		ExecIndexOnlyMarkPos
 *
 *		Mark the current scan position via the index AM.
 * ----------------------------------------------------------------
 */
void
ExecIndexOnlyMarkPos(IndexOnlyScanState *node)
{
	index_markpos(node->ioss_ScanDesc);
}

/* ----------------------------------------------------------------
 *		ExecIndexOnlyRestrPos
 *
 *		Restore the previously marked scan position via the index AM.
 * ----------------------------------------------------------------
 */
void
ExecIndexOnlyRestrPos(IndexOnlyScanState *node)
{
	index_restrpos(node->ioss_ScanDesc);
}

/* ----------------------------------------------------------------
 *		ExecInitIndexOnlyScan
 *
 *		Initializes the index scan's state information, creates
 *		scan keys, and opens the base and index relations.
 *
 *		Note: index scans have 2 sets of state information because
 *			  we have to keep track of the base relation and the
 *			  index relation.
 * ----------------------------------------------------------------
 */
IndexOnlyScanState *
ExecInitIndexOnlyScan(IndexOnlyScan *node, EState *estate, int eflags)
{
	IndexOnlyScanState *indexstate;
	Relation	currentRelation;
	bool		relistarget;
	TupleDesc	tupDesc;

	/*
	 * create state structure
	 */
	indexstate = makeNode(IndexOnlyScanState);
	indexstate->ss.ps.plan = (Plan *) node;
	indexstate->ss.ps.state = estate;

	/*
	 * Miscellaneous initialization
	 *
	 * create expression context for node
	 */
	ExecAssignExprContext(estate, &indexstate->ss.ps);

	indexstate->ss.ps.ps_TupFromTlist = false;

	/*
	 * initialize child expressions
	 *
	 * Note: we don't initialize all of the indexorderby expression, only the
	 * sub-parts corresponding to runtime keys (see below).
	 */
	indexstate->ss.ps.targetlist = (List *)
		ExecInitExpr((Expr *) node->scan.plan.targetlist,
					 (PlanState *) indexstate);
	indexstate->ss.ps.qual = (List *)
		ExecInitExpr((Expr *) node->scan.plan.qual,
					 (PlanState *) indexstate);
	indexstate->indexqual = (List *)
		ExecInitExpr((Expr *) node->indexqual,
					 (PlanState *) indexstate);

	/*
	 * tuple table initialization
	 */
	ExecInitResultTupleSlot(estate, &indexstate->ss.ps);
	ExecInitScanTupleSlot(estate, &indexstate->ss);

	/*
	 * open the base relation and acquire appropriate lock on it.
	 */
	currentRelation = ExecOpenScanRelation(estate, node->scan.scanrelid);

	indexstate->ss.ss_currentRelation = currentRelation;
	indexstate->ss.ss_currentScanDesc = NULL;	/* no heap scan here */

	/*
	 * Initialize result tuple type.
	 */
	ExecAssignResultTypeFromTL(&indexstate->ss.ps);

	/*
	 * If we are just doing EXPLAIN (ie, aren't going to run the plan), stop
	 * here.  This allows an index-advisor plugin to EXPLAIN a plan containing
	 * references to nonexistent indexes.
	 */
	if (eflags & EXEC_FLAG_EXPLAIN_ONLY)
		return indexstate;

	/*
	 * Open the index relation.
	 *
	 * If the parent table is one of the target relations of the query, then
	 * InitPlan already opened and write-locked the index, so we can avoid
	 * taking another lock here.  Otherwise we need a normal reader's lock.
	 */
	relistarget = ExecRelationIsTargetRelation(estate, node->scan.scanrelid);
	indexstate->ioss_RelationDesc = index_open(node->indexid,
									 relistarget ? NoLock : AccessShareLock);

	/*
	 * Now we can get the scan tuple's type (which is the index's rowtype,
	 * not the heap's) and initialize result projection info.
	 */
	tupDesc = index_descriptor_hack(indexstate->ioss_RelationDesc);
	ExecAssignScanType(&indexstate->ss, tupDesc);
	ExecAssignScanProjectionInfo(&indexstate->ss);

	/*
	 * Initialize index-specific scan state
	 */
	indexstate->ioss_RuntimeKeysReady = false;
	indexstate->ioss_RuntimeKeys = NULL;
	indexstate->ioss_NumRuntimeKeys = 0;

	/*
	 * build the index scan keys from the index qualification
	 */
	ExecIndexBuildScanKeys((PlanState *) indexstate,
						   indexstate->ioss_RelationDesc,
						   node->indexqual,
						   false,
						   &indexstate->ioss_ScanKeys,
						   &indexstate->ioss_NumScanKeys,
						   &indexstate->ioss_RuntimeKeys,
						   &indexstate->ioss_NumRuntimeKeys,
						   NULL,	/* no ArrayKeys */
						   NULL);

	/*
	 * any ORDER BY exprs have to be turned into scankeys in the same way
	 */
	ExecIndexBuildScanKeys((PlanState *) indexstate,
						   indexstate->ioss_RelationDesc,
						   node->indexorderby,
						   true,
						   &indexstate->ioss_OrderByKeys,
						   &indexstate->ioss_NumOrderByKeys,
						   &indexstate->ioss_RuntimeKeys,
						   &indexstate->ioss_NumRuntimeKeys,
						   NULL,	/* no ArrayKeys */
						   NULL);

	/*
	 * If we have runtime keys, we need an ExprContext to evaluate them.  The
	 * node's standard context won't do because we want to reset that context
	 * for every tuple.  So, build another context just like the other one...
	 * -tgl 7/11/00
	 */
	if (indexstate->ioss_NumRuntimeKeys != 0)
	{
		ExprContext *stdecontext = indexstate->ss.ps.ps_ExprContext;

		ExecAssignExprContext(estate, &indexstate->ss.ps);
		indexstate->ioss_RuntimeContext = indexstate->ss.ps.ps_ExprContext;
		indexstate->ss.ps.ps_ExprContext = stdecontext;
	}
	else
	{
		indexstate->ioss_RuntimeContext = NULL;
	}

	/*
	 * Initialize scan descriptor.
	 */
	indexstate->ioss_ScanDesc = index_beginscan(currentRelation,
												indexstate->ioss_RelationDesc,
												estate->es_snapshot,
												indexstate->ioss_NumScanKeys,
											indexstate->ioss_NumOrderByKeys);

	/* Set it up for index-only scan */
	indexstate->ioss_ScanDesc->xs_want_itup = true;
	indexstate->ioss_VMBuffer = InvalidBuffer;

	/*
	 * If no run-time keys to calculate, go ahead and pass the scankeys to the
	 * index AM.
	 */
	if (indexstate->ioss_NumRuntimeKeys == 0)
		index_rescan(indexstate->ioss_ScanDesc,
					 indexstate->ioss_ScanKeys,
					 indexstate->ioss_NumScanKeys,
					 indexstate->ioss_OrderByKeys,
					 indexstate->ioss_NumOrderByKeys);

	/*
	 * all done.
	 */
	return indexstate;
}