author     Tom Lane <tgl@sss.pgh.pa.us>    2011-10-09 00:21:08 -0400
committer  Tom Lane <tgl@sss.pgh.pa.us>    2011-10-09 00:21:08 -0400
commit     cbfa92c23c3924d53889320cdbe26f23ee23e40c (patch)
tree       f93756a2e9f1d7e6cbf468b16528f275c04f04e5   /src/include/access/nbtree.h
parent     45401c1c25fe1ef14bf68089de86bcb5cce9f453 (diff)
download   postgresql-cbfa92c23c3924d53889320cdbe26f23ee23e40c.tar.gz
           postgresql-cbfa92c23c3924d53889320cdbe26f23ee23e40c.zip
Improve index-only scans to avoid repeated access to the index page.
We copy all the matched tuples off the page during _bt_readpage, instead of expensively re-locking the page during each subsequent tuple fetch. This costs a bit more local storage, but not more than 2*BLCKSZ worth, and the reduction in LWLock traffic is certainly worth that.

What's more, this lets us get rid of the API wart in the original patch that said an index AM could randomly decline to supply an index tuple despite having asserted pg_am.amcanreturn. That will be important for future improvements in the index-only-scan feature, since the executor will now be able to rely on having the index data available.
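As context for the hunks below: while _bt_readpage still holds the page, each matched IndexTuple is copied into a page-sized workspace owned by the scan, and the per-item entry records the byte offset of that copy, so later tuple fetches read local memory instead of re-locking the index page. The following is a minimal, self-contained C sketch of that workspace scheme; the struct and function names (ScanPos, save_item, fetch_item) are illustrative stand-ins, not the actual nbtree code.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define BLCKSZ 8192                     /* page-sized workspace, as in the patch */

/* Simplified stand-ins for BTScanPosItem / BTScanPosData. */
typedef struct ScanPosItem
{
    unsigned short indexOffset;         /* item's location within the index page */
    unsigned short tupleOffset;         /* copy's offset in the workspace */
} ScanPosItem;

typedef struct ScanPos
{
    int         nextTupleOffset;        /* first free byte in the workspace */
    int         nItems;
    ScanPosItem items[256];
    char       *tuples;                 /* BLCKSZ-sized tuple workspace */
} ScanPos;

/* Copy one matched "tuple" into the workspace while the page is still at hand. */
static void
save_item(ScanPos *pos, unsigned short indexOffset, const char *tuple, size_t len)
{
    ScanPosItem *item = &pos->items[pos->nItems++];

    item->indexOffset = indexOffset;
    item->tupleOffset = (unsigned short) pos->nextTupleOffset;
    memcpy(pos->tuples + pos->nextTupleOffset, tuple, len);
    pos->nextTupleOffset += (int) len;  /* the real code would keep this aligned */
}

/* Later fetches read from local memory; the index page is no longer touched. */
static const char *
fetch_item(const ScanPos *pos, int i)
{
    return pos->tuples + pos->items[i].tupleOffset;
}

int
main(void)
{
    ScanPos     pos = {0};
    const char *matches[] = {"tuple-a", "tuple-bb", "tuple-ccc"};

    pos.tuples = malloc(BLCKSZ);

    /* "Read the page" once: copy every match off it. */
    for (int i = 0; i < 3; i++)
        save_item(&pos, (unsigned short) (i + 1), matches[i], strlen(matches[i]) + 1);

    /* Consume the matches without going back to the page. */
    for (int i = 0; i < pos.nItems; i++)
        printf("offnum %d -> %s\n", (int) pos.items[i].indexOffset, fetch_item(&pos, i));

    free(pos.tuples);
    return 0;
}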
Diffstat (limited to 'src/include/access/nbtree.h')
-rw-r--r--  src/include/access/nbtree.h  22
1 file changed, 21 insertions(+), 1 deletion(-)
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 4e20c79ca6e..199fc940267 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -472,12 +472,18 @@ typedef BTStackData *BTStack;
* items were killed, we re-lock the page to mark them killed, then unlock.
* Finally we drop the pin and step to the next page in the appropriate
* direction.
+ *
+ * If we are doing an index-only scan, we save the entire IndexTuple for each
+ * matched item, otherwise only its heap TID and offset. The IndexTuples go
+ * into a separate workspace array; each BTScanPosItem stores its tuple's
+ * offset within that array.
*/
typedef struct BTScanPosItem /* what we remember about each match */
{
ItemPointerData heapTid; /* TID of referenced heap item */
OffsetNumber indexOffset; /* index item's location within page */
+ LocationIndex tupleOffset; /* IndexTuple's offset in workspace, if any */
} BTScanPosItem;
typedef struct BTScanPosData
@@ -496,6 +502,12 @@ typedef struct BTScanPosData
bool moreRight;
/*
+ * If we are doing an index-only scan, nextTupleOffset is the first free
+ * location in the associated tuple storage workspace.
+ */
+ int nextTupleOffset;
+
+ /*
* The items array is always ordered in index order (ie, increasing
* indexoffset). When scanning backwards it is convenient to fill the
* array back-to-front, so we start at the last slot and fill downwards.
@@ -525,6 +537,14 @@ typedef struct BTScanOpaqueData
int numKilled; /* number of currently stored items */
/*
+ * If we are doing an index-only scan, these are the tuple storage
+ * workspaces for the currPos and markPos respectively. Each is of
+ * size BLCKSZ, so it can hold as much as a full page's worth of tuples.
+ */
+ char *currTuples; /* tuple storage for currPos */
+ char *markTuples; /* tuple storage for markPos */
+
+ /*
* If the marked position is on the same page as current position, we
* don't use markPos, but just keep the marked itemIndex in markItemIndex
* (all the rest of currPos is valid for the mark position). Hence, to
@@ -620,7 +640,7 @@ extern ScanKey _bt_mkscankey_nodata(Relation rel);
extern void _bt_freeskey(ScanKey skey);
extern void _bt_freestack(BTStack stack);
extern void _bt_preprocess_keys(IndexScanDesc scan);
-extern bool _bt_checkkeys(IndexScanDesc scan,
+extern IndexTuple _bt_checkkeys(IndexScanDesc scan,
Page page, OffsetNumber offnum,
ScanDirection dir, bool *continuescan);
extern void _bt_killitems(IndexScanDesc scan, bool haveLock);
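The prototype change above is the other half of the patch: _bt_checkkeys now returns the matched IndexTuple (or NULL) rather than a bool, so a caller that has just validated a tuple can copy it into the workspace without fetching it from the page a second time. Below is a hedged, self-contained sketch of a caller loop under that style of interface; the types and names (FakeTuple, checkkeys) are invented stand-ins, not the real _bt_readpage loop.

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for an index tuple on a page; not PostgreSQL's IndexTupleData. */
typedef struct FakeTuple
{
    int         key;
    char        payload[16];
} FakeTuple;

/*
 * Stand-in for the reworked _bt_checkkeys(): instead of reporting a bare
 * bool, it hands back the tuple it just examined when it matches, and NULL
 * otherwise.  *continuescan is cleared once no further key can match.
 */
static FakeTuple *
checkkeys(FakeTuple *page, int offnum, int maxkey, bool *continuescan)
{
    FakeTuple  *tup = &page[offnum];

    if (tup->key > maxkey)
    {
        *continuescan = false;          /* no further match possible */
        return NULL;
    }
    return tup;
}

int
main(void)
{
    FakeTuple   page[] = {{1, "one"}, {2, "two"}, {5, "five"}, {9, "nine"}};
    bool        continuescan = true;

    /* readpage-style loop: the returned pointer is what gets copied off the page */
    for (int offnum = 0; offnum < 4 && continuescan; offnum++)
    {
        FakeTuple  *tup = checkkeys(page, offnum, 5, &continuescan);

        if (tup != NULL)
            printf("matched key %d (%s)\n", tup->key, tup->payload);
    }
    return 0;
}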