4 files changed, 59 insertions, 9 deletions
diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README
index 7055c242d20..a3f11da8d5a 100644
--- a/src/backend/access/nbtree/README
+++ b/src/backend/access/nbtree/README
@@ -520,6 +520,28 @@ normal running after recovery has completed. This is a key capability
 because it allows running applications to continue while the standby
 changes state into a normally running server.
 
+The interlocking required to avoid returning incorrect results from
+non-MVCC scans is not required on standby nodes. That is because
+HeapTupleSatisfiesUpdate(), HeapTupleSatisfiesSelf(),
+HeapTupleSatisfiesDirty() and HeapTupleSatisfiesVacuum() are only
+ever used during write transactions, which cannot exist on the standby.
+MVCC scans are already protected by definition, so HeapTupleSatisfiesMVCC()
+is not a problem.  That leaves concern only for HeapTupleSatisfiesToast().
+HeapTupleSatisfiesToast() doesn't use MVCC semantics, though that's
+because it doesn't need to - if the main heap row is visible then the
+toast rows will also be visible. So as long as we follow a toast
+pointer from a visible (live) tuple the corresponding toast rows
+will also be visible, so we do not need to recheck MVCC on them.
+There is one minor exception, which is that the optimizer sometimes
+looks at the boundaries of value ranges using SnapshotDirty, which
+could result in returning a newer value for query statistics; this
+would affect the query plan in rare cases, but not the correctness.
+The risk window is small since the stats look at the min and max values
+in the index, so the scan retrieves a tid then immediately uses it
+to look in the heap. It is unlikely that the tid could have been
+deleted, vacuumed and re-inserted in the time taken to look in the heap
+via direct tid access. So we ignore that scan type as a problem.
+
 Other Things That Are Handy to Know
 -----------------------------------
 
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index cd2d4a6c54e..fc31fe4ad9f 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -804,6 +804,10 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 	}
 
 	/*
+	 * Check to see if we need to issue one final WAL record for this index,
+	 * which may be needed for correctness on a hot standby node when non-MVCC
+	 * index scans could take place.
+	 *
 	 * If the WAL is replayed in hot standby, the replay process needs to get
 	 * cleanup locks on all index leaf pages, just as we've been doing here.
 	 * However, we won't issue any WAL records about pages that have no items
@@ -1013,10 +1017,13 @@ restart:
 		if (ndeletable > 0)
 		{
 			/*
-			 * Notice that the issued XLOG_BTREE_VACUUM WAL record includes an
-			 * instruction to the replay code to get cleanup lock on all pages
-			 * between the previous lastBlockVacuumed and this page.  This
-			 * ensures that WAL replay locks all leaf pages at some point.
+			 * Notice that the issued XLOG_BTREE_VACUUM WAL record includes
+			 * all information to the replay code to allow it to get a cleanup
+			 * lock on all pages between the previous lastBlockVacuumed and
+			 * this page. This ensures that WAL replay locks all leaf pages at
+			 * some point, which is important should non-MVCC scans be
+			 * requested. This is currently unused on standby, but we record
+			 * it anyway, so that the WAL contains the required information.
 			 *
 			 * Since we can visit leaf pages out-of-order when recursing,
 			 * replay might end up locking such pages an extra time, but it
diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c
index da28e21d5cb..d569ae1cd66 100644
--- a/src/backend/access/nbtree/nbtxlog.c
+++ b/src/backend/access/nbtree/nbtxlog.c
@@ -385,12 +385,29 @@ static void
 btree_xlog_vacuum(XLogReaderState *record)
 {
 	XLogRecPtr	lsn = record->EndRecPtr;
-	xl_btree_vacuum *xlrec = (xl_btree_vacuum *) XLogRecGetData(record);
 	Buffer		buffer;
 	Page		page;
 	BTPageOpaque opaque;
+#ifdef UNUSED
+	xl_btree_vacuum *xlrec = (xl_btree_vacuum *) XLogRecGetData(record);
 
 	/*
+	 * This section of code is thought to be no longer needed, after analysis
+	 * of the calling paths. It is retained to allow the code to be reinstated
+	 * if a flaw is revealed in that thinking.
+	 *
+	 * If we are running non-MVCC scans using this index we need to do some
+	 * additional work to ensure correctness, which is known as a "pin scan"
+	 * described in more detail in next paragraphs. We used to do the extra
+	 * work in all cases, whereas we now avoid that work in most cases. If
+	 * lastBlockVacuumed is set to InvalidBlockNumber then we skip the
+	 * additional work required for the pin scan.
+	 *
+	 * Avoiding this extra work is important since it requires us to touch
+	 * every page in the index, so is an O(N) operation. Worse, it is an
+	 * operation performed in the foreground during redo, so it delays
+	 * replication directly.
+	 *
 	 * If queries might be active then we need to ensure every leaf page is
 	 * unpinned between the lastBlockVacuumed and the current block, if there
 	 * are any.  This prevents replay of the VACUUM from reaching the stage of
@@ -412,7 +429,7 @@ btree_xlog_vacuum(XLogReaderState *record)
 	 * isn't yet consistent; so we need not fear reading still-corrupt blocks
 	 * here during crash recovery.
 	 */
-	if (HotStandbyActiveInReplay())
+	if (HotStandbyActiveInReplay() && BlockNumberIsValid(xlrec->lastBlockVacuumed))
 	{
 		RelFileNode thisrnode;
 		BlockNumber thisblkno;
@@ -433,7 +450,8 @@ btree_xlog_vacuum(XLogReaderState *record)
 			 * XXX we don't actually need to read the block, we just need to
 			 * confirm it is unpinned. If we had a special call into the
 			 * buffer manager we could optimise this so that if the block is
-			 * not in shared_buffers we confirm it as unpinned.
+			 * not in shared_buffers we confirm it as unpinned. Optimizing
+			 * this is now moot, since in most cases we avoid the scan.
 			 */
 			buffer = XLogReadBufferExtended(thisrnode, MAIN_FORKNUM, blkno,
 											RBM_NORMAL_NO_LOG);
@@ -444,6 +462,7 @@ btree_xlog_vacuum(XLogReaderState *record)
 			}
 		}
 	}
+#endif
 
 	/*
 	 * Like in btvacuumpage(), we need to take a cleanup lock on every leaf
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 1b0c649c91d..42928d49f9b 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -331,8 +331,10 @@ typedef struct xl_btree_reuse_page
  * The WAL record can represent deletion of any number of index tuples on a
  * single index page when executed by VACUUM.
  *
- * The correctness requirement for applying these changes during recovery is
- * that we must do one of these two things for every block in the index:
+ * For MVCC scans, lastBlockVacuumed will be set to InvalidBlockNumber.
+ * For a non-MVCC index scans there is an additional correctness requirement
+ * for applying these changes during recovery, which is that we must do one
+ * of these two things for every block in the index:
  *		* lock the block for cleanup and apply any required changes
  *		* EnsureBlockUnpinned()
  * The purpose of this is to ensure that no index scans started before we