diff options
author | Tom Lane <tgl@sss.pgh.pa.us> | 2011-10-11 14:20:06 -0400 |
---|---|---|
committer | Tom Lane <tgl@sss.pgh.pa.us> | 2011-10-11 14:21:30 -0400 |
commit | a0185461dd94c8d31d8d55a7f2839b0d2f172ab9 (patch) | |
tree | 3bd68d4e123336bbdefa8fd92372f0af7fb6d64f /src/backend/executor/nodeIndexonlyscan.c | |
parent | fa351d5a0db0672b6f586315720302e493116f27 (diff) | |
download | postgresql-a0185461dd94c8d31d8d55a7f2839b0d2f172ab9.tar.gz postgresql-a0185461dd94c8d31d8d55a7f2839b0d2f172ab9.zip |
Rearrange the implementation of index-only scans.
This commit changes index-only scans so that data is read directly from the
index tuple without first generating a faux heap tuple. The only immediate
benefit is that indexes on system columns (such as OID) can be used in
index-only scans, but this is necessary infrastructure if we are ever to
support index-only scans on expression indexes. The executor is now ready
for that, though the planner still needs substantial work to recognize
the possibility.
To do this, Vars in index-only plan nodes have to refer to index columns
not heap columns. I introduced a new special varno, INDEX_VAR, to mark
such Vars to avoid confusion. (In passing, this commit renames the two
existing special varnos to OUTER_VAR and INNER_VAR.) This allows
ruleutils.c to handle them with logic similar to what we use for subplan
reference Vars.
Since index-only scans are now fundamentally different from regular
indexscans so far as their expression subtrees are concerned, I also chose
to change them to have their own plan node type (and hence, their own
executor source file).
Diffstat (limited to 'src/backend/executor/nodeIndexonlyscan.c')
-rw-r--r-- | src/backend/executor/nodeIndexonlyscan.c | 542 |
1 file changed, 542 insertions, 0 deletions
/*-------------------------------------------------------------------------
 *
 * nodeIndexonlyscan.c
 *	  Routines to support index-only scans
 *
 * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/executor/nodeIndexonlyscan.c
 *
 *-------------------------------------------------------------------------
 */
/*
 * INTERFACE ROUTINES
 *		ExecIndexOnlyScan			scans an index
 *		IndexOnlyNext				retrieve next tuple
 *		ExecInitIndexOnlyScan		creates and initializes state info.
 *		ExecReScanIndexOnlyScan		rescans the indexed relation.
 *		ExecEndIndexOnlyScan		releases all storage.
 *		ExecIndexOnlyMarkPos		marks scan position.
 *		ExecIndexOnlyRestrPos		restores scan position.
 */
#include "postgres.h"

#include "access/relscan.h"
#include "access/visibilitymap.h"
#include "catalog/pg_opfamily.h"
#include "catalog/pg_type.h"
#include "executor/execdebug.h"
#include "executor/nodeIndexonlyscan.h"
#include "executor/nodeIndexscan.h"
#include "storage/bufmgr.h"
#include "utils/memutils.h"
#include "utils/rel.h"


static TupleTableSlot *IndexOnlyNext(IndexOnlyScanState *node);
static void StoreIndexTuple(TupleTableSlot *slot, IndexTuple itup,
				Relation indexRel);


/* ----------------------------------------------------------------
 *		IndexOnlyNext
 *
 *		Retrieve a tuple from the IndexOnlyScan node's index.
 * ----------------------------------------------------------------
 */
static TupleTableSlot *
IndexOnlyNext(IndexOnlyScanState *node)
{
	EState	   *estate;
	ExprContext *econtext;
	ScanDirection direction;
	IndexScanDesc scandesc;
	HeapTuple	tuple;
	TupleTableSlot *slot;
	ItemPointer tid;

	/*
	 * extract necessary information from index scan node
	 */
	estate = node->ss.ps.state;
	direction = estate->es_direction;
	/* flip direction if this is an overall backward scan */
	if (ScanDirectionIsBackward(((IndexOnlyScan *) node->ss.ps.plan)->indexorderdir))
	{
		if (ScanDirectionIsForward(direction))
			direction = BackwardScanDirection;
		else if (ScanDirectionIsBackward(direction))
			direction = ForwardScanDirection;
	}
	scandesc = node->ioss_ScanDesc;
	econtext = node->ss.ps.ps_ExprContext;
	slot = node->ss.ss_ScanTupleSlot;

	/*
	 * OK, now that we have what we need, fetch the next tuple.
	 */
	while ((tid = index_getnext_tid(scandesc, direction)) != NULL)
	{
		/*
		 * We can skip the heap fetch if the TID references a heap page on
		 * which all tuples are known visible to everybody.  In any case,
		 * we'll use the index tuple not the heap tuple as the data source.
		 */
		if (!visibilitymap_test(scandesc->heapRelation,
								ItemPointerGetBlockNumber(tid),
								&node->ioss_VMBuffer))
		{
			/*
			 * Rats, we have to visit the heap to check visibility.
			 */
			tuple = index_fetch_heap(scandesc);
			if (tuple == NULL)
				continue;		/* no visible tuple, try next index entry */

			/*
			 * Only MVCC snapshots are supported here, so there should be no
			 * need to keep following the HOT chain once a visible entry has
			 * been found.  If we did want to allow that, we'd need to keep
			 * more state to remember not to call index_getnext_tid next time.
			 */
			if (scandesc->xs_continue_hot)
				elog(ERROR, "non-MVCC snapshots are not supported in index-only scans");

			/*
			 * Note: at this point we are holding a pin on the heap page, as
			 * recorded in scandesc->xs_cbuf.  We could release that pin now,
			 * but it's not clear whether it's a win to do so.  The next index
			 * entry might require a visit to the same heap page.
			 */
		}

		/*
		 * Fill the scan tuple slot with data from the index.
		 */
		StoreIndexTuple(slot, scandesc->xs_itup, scandesc->indexRelation);

		/*
		 * If the index was lossy, we have to recheck the index quals.
		 * (Currently, this can never happen, but we should support the case
		 * for possible future use, eg with GiST indexes.)
		 */
		if (scandesc->xs_recheck)
		{
			econtext->ecxt_scantuple = slot;
			ResetExprContext(econtext);
			if (!ExecQual(node->indexqual, econtext, false))
			{
				/* Fails recheck, so drop it and loop back for another */
				InstrCountFiltered2(node, 1);
				continue;
			}
		}

		return slot;
	}

	/*
	 * if we get here it means the index scan failed so we are at the end of
	 * the scan..
	 */
	return ExecClearTuple(slot);
}

/*
 * StoreIndexTuple
 *		Fill the slot with data from the index tuple.
 *
 * At some point this might be generally-useful functionality, but
 * right now we don't need it elsewhere.
 */
static void
StoreIndexTuple(TupleTableSlot *slot, IndexTuple itup, Relation indexRel)
{
	TupleDesc	indexDesc = RelationGetDescr(indexRel);
	int			nindexatts = indexDesc->natts;
	Datum	   *values = slot->tts_values;
	bool	   *isnull = slot->tts_isnull;
	int			i;

	/*
	 * Note: we must use the index relation's tupdesc in index_getattr,
	 * not the slot's tupdesc, because of index_descriptor_hack().
	 */
	Assert(slot->tts_tupleDescriptor->natts == nindexatts);

	ExecClearTuple(slot);
	for (i = 0; i < nindexatts; i++)
		values[i] = index_getattr(itup, i + 1, indexDesc, &isnull[i]);
	ExecStoreVirtualTuple(slot);
}

/*
 * index_descriptor_hack -- ugly kluge to make index's tupdesc OK for slot
 *
 * This is necessary because, alone among btree opclasses, name_ops uses
 * a storage type (cstring) different from its input type.  The index
 * tuple descriptor will show "cstring", which is correct, but we have to
 * expose "name" as the slot datatype or ExecEvalVar will whine.  If we
 * ever want to have any other cases with a different storage type, we ought
 * to think of a cleaner solution than this.
 */
static TupleDesc
index_descriptor_hack(Relation indexRel)
{
	TupleDesc	tupdesc = RelationGetDescr(indexRel);
	int			i;

	/* copy so we can scribble on it safely */
	tupdesc = CreateTupleDescCopy(tupdesc);

	for (i = 0; i < tupdesc->natts; i++)
	{
		if (indexRel->rd_opfamily[i] == NAME_BTREE_FAM_OID &&
			tupdesc->attrs[i]->atttypid == CSTRINGOID)
		{
			tupdesc->attrs[i]->atttypid = NAMEOID;

			/*
			 * We set attlen to match the type OID just in case anything looks
			 * at it.  Note that this is safe only because StoreIndexTuple
			 * will insert the data as a virtual tuple, and we don't expect
			 * anything will try to materialize the scan tuple slot.
			 */
			tupdesc->attrs[i]->attlen = NAMEDATALEN;
		}
	}

	return tupdesc;
}

/*
 * IndexOnlyRecheck -- access method routine to recheck a tuple in EvalPlanQual
 *
 * This can't really happen, since an index can't supply CTID which would
 * be necessary data for any potential EvalPlanQual target relation.  If it
 * did happen, the EPQ code would pass us the wrong data, namely a heap
 * tuple not an index tuple.  So throw an error.
 */
static bool
IndexOnlyRecheck(IndexOnlyScanState *node, TupleTableSlot *slot)
{
	elog(ERROR, "EvalPlanQual recheck is not supported in index-only scans");
	return false;				/* keep compiler quiet */
}

/* ----------------------------------------------------------------
 *		ExecIndexOnlyScan(node)
 *
 *		Fetch the next tuple via the generic ExecScan machinery,
 *		using IndexOnlyNext as the access method.
 * ----------------------------------------------------------------
 */
TupleTableSlot *
ExecIndexOnlyScan(IndexOnlyScanState *node)
{
	/*
	 * If we have runtime keys and they've not already been set up, do it now.
	 */
	if (node->ioss_NumRuntimeKeys != 0 && !node->ioss_RuntimeKeysReady)
		ExecReScan((PlanState *) node);

	return ExecScan(&node->ss,
					(ExecScanAccessMtd) IndexOnlyNext,
					(ExecScanRecheckMtd) IndexOnlyRecheck);
}

/* ----------------------------------------------------------------
 *		ExecReScanIndexOnlyScan(node)
 *
 *		Recalculates the values of any scan keys whose value depends on
 *		information known at runtime, then rescans the indexed relation.
 *
 *		Updating the scan key was formerly done separately in
 *		ExecUpdateIndexScanKeys.  Integrating it into ReScan makes
 *		rescans of indices and relations/general streams more uniform.
 * ----------------------------------------------------------------
 */
void
ExecReScanIndexOnlyScan(IndexOnlyScanState *node)
{
	/*
	 * If we are doing runtime key calculations (ie, any of the index key
	 * values weren't simple Consts), compute the new key values.  But first,
	 * reset the context so we don't leak memory as each outer tuple is
	 * scanned.  Note this assumes that we will recalculate *all* runtime keys
	 * on each call.
	 */
	if (node->ioss_NumRuntimeKeys != 0)
	{
		ExprContext *econtext = node->ioss_RuntimeContext;

		ResetExprContext(econtext);
		ExecIndexEvalRuntimeKeys(econtext,
								 node->ioss_RuntimeKeys,
								 node->ioss_NumRuntimeKeys);
	}
	node->ioss_RuntimeKeysReady = true;

	/* reset index scan */
	index_rescan(node->ioss_ScanDesc,
				 node->ioss_ScanKeys, node->ioss_NumScanKeys,
				 node->ioss_OrderByKeys, node->ioss_NumOrderByKeys);

	ExecScanReScan(&node->ss);
}


/* ----------------------------------------------------------------
 *		ExecEndIndexOnlyScan
 *
 *		Releases the VM buffer pin, scan descriptor, index relation,
 *		and heap relation held by the node.
 * ----------------------------------------------------------------
 */
void
ExecEndIndexOnlyScan(IndexOnlyScanState *node)
{
	Relation	indexRelationDesc;
	IndexScanDesc indexScanDesc;
	Relation	relation;

	/*
	 * extract information from the node
	 */
	indexRelationDesc = node->ioss_RelationDesc;
	indexScanDesc = node->ioss_ScanDesc;
	relation = node->ss.ss_currentRelation;

	/* Release VM buffer pin, if any. */
	if (node->ioss_VMBuffer != InvalidBuffer)
	{
		ReleaseBuffer(node->ioss_VMBuffer);
		node->ioss_VMBuffer = InvalidBuffer;
	}

	/*
	 * Free the exprcontext(s) ... now dead code, see ExecFreeExprContext
	 */
#ifdef NOT_USED
	ExecFreeExprContext(&node->ss.ps);
	if (node->ioss_RuntimeContext)
		FreeExprContext(node->ioss_RuntimeContext, true);
#endif

	/*
	 * clear out tuple table slots
	 */
	ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
	ExecClearTuple(node->ss.ss_ScanTupleSlot);

	/*
	 * close the index relation (no-op if we didn't open it)
	 */
	if (indexScanDesc)
		index_endscan(indexScanDesc);
	if (indexRelationDesc)
		index_close(indexRelationDesc, NoLock);

	/*
	 * close the heap relation.
	 */
	ExecCloseScanRelation(relation);
}

/* ----------------------------------------------------------------
 *		ExecIndexOnlyMarkPos
 *
 *		Mark the current scan position via the index AM.
 * ----------------------------------------------------------------
 */
void
ExecIndexOnlyMarkPos(IndexOnlyScanState *node)
{
	index_markpos(node->ioss_ScanDesc);
}

/* ----------------------------------------------------------------
 *		ExecIndexOnlyRestrPos
 *
 *		Restore the previously marked scan position via the index AM.
 * ----------------------------------------------------------------
 */
void
ExecIndexOnlyRestrPos(IndexOnlyScanState *node)
{
	index_restrpos(node->ioss_ScanDesc);
}

/* ----------------------------------------------------------------
 *		ExecInitIndexOnlyScan
 *
 *		Initializes the index scan's state information, creates
 *		scan keys, and opens the base and index relations.
 *
 *		Note: index scans have 2 sets of state information because
 *			  we have to keep track of the base relation and the
 *			  index relation.
 * ----------------------------------------------------------------
 */
IndexOnlyScanState *
ExecInitIndexOnlyScan(IndexOnlyScan *node, EState *estate, int eflags)
{
	IndexOnlyScanState *indexstate;
	Relation	currentRelation;
	bool		relistarget;
	TupleDesc	tupDesc;

	/*
	 * create state structure
	 */
	indexstate = makeNode(IndexOnlyScanState);
	indexstate->ss.ps.plan = (Plan *) node;
	indexstate->ss.ps.state = estate;

	/*
	 * Miscellaneous initialization
	 *
	 * create expression context for node
	 */
	ExecAssignExprContext(estate, &indexstate->ss.ps);

	indexstate->ss.ps.ps_TupFromTlist = false;

	/*
	 * initialize child expressions
	 *
	 * Note: we don't initialize all of the indexorderby expression, only the
	 * sub-parts corresponding to runtime keys (see below).
	 */
	indexstate->ss.ps.targetlist = (List *)
		ExecInitExpr((Expr *) node->scan.plan.targetlist,
					 (PlanState *) indexstate);
	indexstate->ss.ps.qual = (List *)
		ExecInitExpr((Expr *) node->scan.plan.qual,
					 (PlanState *) indexstate);
	indexstate->indexqual = (List *)
		ExecInitExpr((Expr *) node->indexqual,
					 (PlanState *) indexstate);

	/*
	 * tuple table initialization
	 */
	ExecInitResultTupleSlot(estate, &indexstate->ss.ps);
	ExecInitScanTupleSlot(estate, &indexstate->ss);

	/*
	 * open the base relation and acquire appropriate lock on it.
	 */
	currentRelation = ExecOpenScanRelation(estate, node->scan.scanrelid);

	indexstate->ss.ss_currentRelation = currentRelation;
	indexstate->ss.ss_currentScanDesc = NULL;	/* no heap scan here */

	/*
	 * Initialize result tuple type.
	 */
	ExecAssignResultTypeFromTL(&indexstate->ss.ps);

	/*
	 * If we are just doing EXPLAIN (ie, aren't going to run the plan), stop
	 * here.  This allows an index-advisor plugin to EXPLAIN a plan containing
	 * references to nonexistent indexes.
	 */
	if (eflags & EXEC_FLAG_EXPLAIN_ONLY)
		return indexstate;

	/*
	 * Open the index relation.
	 *
	 * If the parent table is one of the target relations of the query, then
	 * InitPlan already opened and write-locked the index, so we can avoid
	 * taking another lock here.  Otherwise we need a normal reader's lock.
	 */
	relistarget = ExecRelationIsTargetRelation(estate, node->scan.scanrelid);
	indexstate->ioss_RelationDesc = index_open(node->indexid,
									 relistarget ? NoLock : AccessShareLock);

	/*
	 * Now we can get the scan tuple's type (which is the index's rowtype,
	 * not the heap's) and initialize result projection info.
	 */
	tupDesc = index_descriptor_hack(indexstate->ioss_RelationDesc);
	ExecAssignScanType(&indexstate->ss, tupDesc);
	ExecAssignScanProjectionInfo(&indexstate->ss);

	/*
	 * Initialize index-specific scan state
	 */
	indexstate->ioss_RuntimeKeysReady = false;
	indexstate->ioss_RuntimeKeys = NULL;
	indexstate->ioss_NumRuntimeKeys = 0;

	/*
	 * build the index scan keys from the index qualification
	 */
	ExecIndexBuildScanKeys((PlanState *) indexstate,
						   indexstate->ioss_RelationDesc,
						   node->indexqual,
						   false,
						   &indexstate->ioss_ScanKeys,
						   &indexstate->ioss_NumScanKeys,
						   &indexstate->ioss_RuntimeKeys,
						   &indexstate->ioss_NumRuntimeKeys,
						   NULL,	/* no ArrayKeys */
						   NULL);

	/*
	 * any ORDER BY exprs have to be turned into scankeys in the same way
	 */
	ExecIndexBuildScanKeys((PlanState *) indexstate,
						   indexstate->ioss_RelationDesc,
						   node->indexorderby,
						   true,
						   &indexstate->ioss_OrderByKeys,
						   &indexstate->ioss_NumOrderByKeys,
						   &indexstate->ioss_RuntimeKeys,
						   &indexstate->ioss_NumRuntimeKeys,
						   NULL,	/* no ArrayKeys */
						   NULL);

	/*
	 * If we have runtime keys, we need an ExprContext to evaluate them.  The
	 * node's standard context won't do because we want to reset that context
	 * for every tuple.  So, build another context just like the other one...
	 * -tgl 7/11/00
	 */
	if (indexstate->ioss_NumRuntimeKeys != 0)
	{
		ExprContext *stdecontext = indexstate->ss.ps.ps_ExprContext;

		ExecAssignExprContext(estate, &indexstate->ss.ps);
		indexstate->ioss_RuntimeContext = indexstate->ss.ps.ps_ExprContext;
		indexstate->ss.ps.ps_ExprContext = stdecontext;
	}
	else
	{
		indexstate->ioss_RuntimeContext = NULL;
	}

	/*
	 * Initialize scan descriptor.
	 */
	indexstate->ioss_ScanDesc = index_beginscan(currentRelation,
												indexstate->ioss_RelationDesc,
												estate->es_snapshot,
												indexstate->ioss_NumScanKeys,
											indexstate->ioss_NumOrderByKeys);

	/* Set it up for index-only scan */
	indexstate->ioss_ScanDesc->xs_want_itup = true;
	indexstate->ioss_VMBuffer = InvalidBuffer;

	/*
	 * If no run-time keys to calculate, go ahead and pass the scankeys to the
	 * index AM.
	 */
	if (indexstate->ioss_NumRuntimeKeys == 0)
		index_rescan(indexstate->ioss_ScanDesc,
					 indexstate->ioss_ScanKeys,
					 indexstate->ioss_NumScanKeys,
					 indexstate->ioss_OrderByKeys,
					 indexstate->ioss_NumOrderByKeys);

	/*
	 * all done.
	 */
	return indexstate;
}