28 files changed, 1320 insertions, 926 deletions
diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c
index 01e1db7f856..4204088fa0d 100644
--- a/src/backend/access/brin/brin.c
+++ b/src/backend/access/brin/brin.c
@@ -68,7 +68,7 @@ typedef struct BrinShared
 	int			scantuplesortstates;
 
 	/* Query ID, for report in worker processes */
-	uint64		queryid;
+	int64		queryid;
 
 	/*
 	 * workersdonecv is used to monitor the progress of workers.  All parallel
diff --git a/src/backend/access/common/reloptions.c b/src/backend/access/common/reloptions.c
index 46c1dce222d..50747c16396 100644
--- a/src/backend/access/common/reloptions.c
+++ b/src/backend/access/common/reloptions.c
@@ -1243,8 +1243,9 @@ transformRelOptions(Datum oldOptions, List *defList, const char *namspace,
 		}
 		else
 		{
-			text	   *t;
+			const char *name;
 			const char *value;
+			text	   *t;
 			Size		len;
 
 			/*
@@ -1291,11 +1292,19 @@ transformRelOptions(Datum oldOptions, List *defList, const char *namspace,
 			 * have just "name", assume "name=true" is meant.  Note: the
 			 * namespace is not output.
 			 */
+			name = def->defname;
 			if (def->arg != NULL)
 				value = defGetString(def);
 			else
 				value = "true";
 
+			/* Insist that name not contain "=", else "a=b=c" is ambiguous */
+			if (strchr(name, '=') != NULL)
+				ereport(ERROR,
+						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+						 errmsg("invalid option name \"%s\": must not contain \"=\"",
+								name)));
+
 			/*
 			 * This is not a great place for this test, but there's no other
 			 * convenient place to filter the option out. As WITH (oids =
@@ -1303,7 +1312,7 @@ transformRelOptions(Datum oldOptions, List *defList, const char *namspace,
 			 * amount of ugly.
 			 */
 			if (acceptOidsOff && def->defnamespace == NULL &&
-				strcmp(def->defname, "oids") == 0)
+				strcmp(name, "oids") == 0)
 			{
 				if (defGetBoolean(def))
 					ereport(ERROR,
@@ -1313,11 +1322,11 @@ transformRelOptions(Datum oldOptions, List *defList, const char *namspace,
 				continue;
 			}
 
-			len = VARHDRSZ + strlen(def->defname) + 1 + strlen(value);
+			len = VARHDRSZ + strlen(name) + 1 + strlen(value);
 			/* +1 leaves room for sprintf's trailing null */
 			t = (text *) palloc(len + 1);
 			SET_VARSIZE(t, len);
-			sprintf(VARDATA(t), "%s=%s", def->defname, value);
+			sprintf(VARDATA(t), "%s=%s", name, value);
 
 			astate = accumArrayResult(astate, PointerGetDatum(t),
 									  false, TEXTOID,
diff --git a/src/backend/access/common/tupdesc.c b/src/backend/access/common/tupdesc.c
index ffd0c78f905..020d00cd01c 100644
--- a/src/backend/access/common/tupdesc.c
+++ b/src/backend/access/common/tupdesc.c
@@ -142,11 +142,18 @@ void
 verify_compact_attribute(TupleDesc tupdesc, int attnum)
 {
 #ifdef USE_ASSERT_CHECKING
-	CompactAttribute *cattr = &tupdesc->compact_attrs[attnum];
+	CompactAttribute cattr;
 	Form_pg_attribute attr = TupleDescAttr(tupdesc, attnum);
 	CompactAttribute tmp;
 
 	/*
+	 * Make a temp copy of the TupleDesc's CompactAttribute.  This may be a
+	 * shared TupleDesc and the attcacheoff might get changed by another
+	 * backend.
+	 */
+	memcpy(&cattr, &tupdesc->compact_attrs[attnum], sizeof(CompactAttribute));
+
+	/*
 	 * Populate the temporary CompactAttribute from the corresponding
 	 * Form_pg_attribute
 	 */
@@ -156,11 +163,11 @@ verify_compact_attribute(TupleDesc tupdesc, int attnum)
 	 * Make the attcacheoff match since it's been reset to -1 by
 	 * populate_compact_attribute_internal.  Same with attnullability.
 	 */
-	tmp.attcacheoff = cattr->attcacheoff;
-	tmp.attnullability = cattr->attnullability;
+	tmp.attcacheoff = cattr.attcacheoff;
+	tmp.attnullability = cattr.attnullability;
 
 	/* Check the freshly populated CompactAttribute matches the TupleDesc's */
-	Assert(memcmp(&tmp, cattr, sizeof(CompactAttribute)) == 0);
+	Assert(memcmp(&tmp, &cattr, sizeof(CompactAttribute)) == 0);
 #endif
 }
 
diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c
index a6b701943d3..c0aa7d0222f 100644
--- a/src/backend/access/gist/gistutil.c
+++ b/src/backend/access/gist/gistutil.c
@@ -1058,11 +1058,11 @@ gistGetFakeLSN(Relation rel)
 }
 
 /*
- * This is a stratnum support function for GiST opclasses that use the
- * RT*StrategyNumber constants.
+ * This is a stratnum translation support function for GiST opclasses that use
+ * the RT*StrategyNumber constants.
  */
 Datum
-gist_stratnum_common(PG_FUNCTION_ARGS)
+gist_translate_cmptype_common(PG_FUNCTION_ARGS)
 {
 	CompareType cmptype = PG_GETARG_INT32(0);
 
@@ -1090,9 +1090,9 @@ gist_stratnum_common(PG_FUNCTION_ARGS)
 /*
  * Returns the opclass's private stratnum used for the given compare type.
  *
- * Calls the opclass's GIST_STRATNUM_PROC support function, if any,
- * and returns the result.
- * Returns InvalidStrategy if the function is not defined.
+ * Calls the opclass's GIST_TRANSLATE_CMPTYPE_PROC support function, if any,
+ * and returns the result.  Returns InvalidStrategy if the function is not
+ * defined.
  */
 StrategyNumber
 gisttranslatecmptype(CompareType cmptype, Oid opfamily)
@@ -1101,7 +1101,7 @@ gisttranslatecmptype(CompareType cmptype, Oid opfamily)
 	Datum		result;
 
 	/* Check whether the function is provided. */
-	funcid = get_opfamily_proc(opfamily, ANYOID, ANYOID, GIST_STRATNUM_PROC);
+	funcid = get_opfamily_proc(opfamily, ANYOID, ANYOID, GIST_TRANSLATE_CMPTYPE_PROC);
 	if (!OidIsValid(funcid))
 		return InvalidStrategy;
 
diff --git a/src/backend/access/gist/gistvalidate.c b/src/backend/access/gist/gistvalidate.c
index 2a49e6d20f0..2ed6f74fce9 100644
--- a/src/backend/access/gist/gistvalidate.c
+++ b/src/backend/access/gist/gistvalidate.c
@@ -138,7 +138,7 @@ gistvalidate(Oid opclassoid)
 				ok = check_amproc_signature(procform->amproc, VOIDOID, true,
 											1, 1, INTERNALOID);
 				break;
-			case GIST_STRATNUM_PROC:
+			case GIST_TRANSLATE_CMPTYPE_PROC:
 				ok = check_amproc_signature(procform->amproc, INT2OID, true,
 											1, 1, INT4OID) &&
 					procform->amproclefttype == ANYOID &&
@@ -265,7 +265,7 @@ gistvalidate(Oid opclassoid)
 		if (i == GIST_DISTANCE_PROC || i == GIST_FETCH_PROC ||
 			i == GIST_COMPRESS_PROC || i == GIST_DECOMPRESS_PROC ||
 			i == GIST_OPTIONS_PROC || i == GIST_SORTSUPPORT_PROC ||
-			i == GIST_STRATNUM_PROC)
+			i == GIST_TRANSLATE_CMPTYPE_PROC)
 			continue;			/* optional methods */
 		ereport(INFO,
 				(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
@@ -336,7 +336,7 @@ gistadjustmembers(Oid opfamilyoid,
 			case GIST_FETCH_PROC:
 			case GIST_OPTIONS_PROC:
 			case GIST_SORTSUPPORT_PROC:
-			case GIST_STRATNUM_PROC:
+			case GIST_TRANSLATE_CMPTYPE_PROC:
 				/* Optional, so force it to be a soft family dependency */
 				op->ref_is_hard = false;
 				op->ref_is_family = true;
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 9ec8cda1c68..0dcd6ee817e 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -213,6 +213,27 @@ static const int MultiXactStatusLock[MaxMultiXactStatus + 1] =
 #define TUPLOCK_from_mxstatus(status) \
 			(MultiXactStatusLock[(status)])
 
+/*
+ * Check that we have a valid snapshot if we might need TOAST access.
+ */
+static inline void
+AssertHasSnapshotForToast(Relation rel)
+{
+#ifdef USE_ASSERT_CHECKING
+
+	/* bootstrap mode in particular breaks this rule */
+	if (!IsNormalProcessingMode())
+		return;
+
+	/* if the relation doesn't have a TOAST table, we are good */
+	if (!OidIsValid(rel->rd_rel->reltoastrelid))
+		return;
+
+	Assert(HaveRegisteredOrActiveSnapshot());
+
+#endif							/* USE_ASSERT_CHECKING */
+}
+
 /* ----------------------------------------------------------------
  *						 heap support routines
  * ----------------------------------------------------------------
@@ -2066,6 +2087,8 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid,
 	Assert(HeapTupleHeaderGetNatts(tup->t_data) <=
 		   RelationGetNumberOfAttributes(relation));
 
+	AssertHasSnapshotForToast(relation);
+
 	/*
 	 * Fill in tuple header fields and toast the tuple if necessary.
 	 *
@@ -2343,6 +2366,8 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples,
 	/* currently not needed (thus unsupported) for heap_multi_insert() */
 	Assert(!(options & HEAP_INSERT_NO_LOGICAL));
 
+	AssertHasSnapshotForToast(relation);
+
 	needwal = RelationNeedsWAL(relation);
 	saveFreeSpace = RelationGetTargetPageFreeSpace(relation,
 												   HEAP_DEFAULT_FILLFACTOR);
@@ -2765,6 +2790,8 @@ heap_delete(Relation relation, ItemPointer tid,
 
 	Assert(ItemPointerIsValid(tid));
 
+	AssertHasSnapshotForToast(relation);
+
 	/*
 	 * Forbid this during a parallel operation, lest it allocate a combo CID.
 	 * Other workers might need that combo CID for visibility checks, and we
@@ -3260,6 +3287,8 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
 	Assert(HeapTupleHeaderGetNatts(newtup->t_data) <=
 		   RelationGetNumberOfAttributes(relation));
 
+	AssertHasSnapshotForToast(relation);
+
 	/*
 	 * Forbid this during a parallel operation, lest it allocate a combo CID.
 	 * Other workers might need that combo CID for visibility checks, and we
@@ -4953,7 +4982,7 @@ l3:
 					case LockWaitError:
 						if (!ConditionalMultiXactIdWait((MultiXactId) xwait,
 														status, infomask, relation,
-														NULL, log_lock_failure))
+														NULL, log_lock_failures))
 							ereport(ERROR,
 									(errcode(ERRCODE_LOCK_NOT_AVAILABLE),
 									 errmsg("could not obtain lock on row in relation \"%s\"",
@@ -4991,7 +5020,7 @@ l3:
 						}
 						break;
 					case LockWaitError:
-						if (!ConditionalXactLockTableWait(xwait, log_lock_failure))
+						if (!ConditionalXactLockTableWait(xwait, log_lock_failures))
 							ereport(ERROR,
 									(errcode(ERRCODE_LOCK_NOT_AVAILABLE),
 									 errmsg("could not obtain lock on row in relation \"%s\"",
@@ -5256,7 +5285,7 @@ heap_acquire_tuplock(Relation relation, ItemPointer tid, LockTupleMode mode,
 			break;
 
 		case LockWaitError:
-			if (!ConditionalLockTupleTuplock(relation, tid, mode, log_lock_failure))
+			if (!ConditionalLockTupleTuplock(relation, tid, mode, log_lock_failures))
 				ereport(ERROR,
 						(errcode(ERRCODE_LOCK_NOT_AVAILABLE),
 						 errmsg("could not obtain lock on row in relation \"%s\"",
diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index ac082fefa77..cb4bc35c93e 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -464,7 +464,7 @@ tuple_lock_retry:
 									return TM_WouldBlock;
 								break;
 							case LockWaitError:
-								if (!ConditionalXactLockTableWait(SnapshotDirty.xmax, log_lock_failure))
+								if (!ConditionalXactLockTableWait(SnapshotDirty.xmax, log_lock_failures))
 									ereport(ERROR,
 											(errcode(ERRCODE_LOCK_NOT_AVAILABLE),
 											 errmsg("could not obtain lock on row in relation \"%s\"",
diff --git a/src/backend/access/heap/heapam_xlog.c b/src/backend/access/heap/heapam_xlog.c
index 30f4c2d3c67..eb4bd3d6ae3 100644
--- a/src/backend/access/heap/heapam_xlog.c
+++ b/src/backend/access/heap/heapam_xlog.c
@@ -438,6 +438,9 @@ heap_xlog_insert(XLogReaderState *record)
 	ItemPointerSetBlockNumber(&target_tid, blkno);
 	ItemPointerSetOffsetNumber(&target_tid, xlrec->offnum);
 
+	/* No freezing in the heap_insert() code path */
+	Assert(!(xlrec->flags & XLH_INSERT_ALL_FROZEN_SET));
+
 	/*
 	 * The visibility map may need to be fixed even if the heap page is
 	 * already up-to-date.
@@ -508,10 +511,6 @@ heap_xlog_insert(XLogReaderState *record)
 		if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED)
 			PageClearAllVisible(page);
 
-		/* XLH_INSERT_ALL_FROZEN_SET implies that all tuples are visible */
-		if (xlrec->flags & XLH_INSERT_ALL_FROZEN_SET)
-			PageSetAllVisible(page);
-
 		MarkBufferDirty(buffer);
 	}
 	if (BufferIsValid(buffer))
diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c
index f28326bad09..14036c27e87 100644
--- a/src/backend/access/heap/vacuumlazy.c
+++ b/src/backend/access/heap/vacuumlazy.c
@@ -423,7 +423,7 @@ typedef struct LVSavedErrInfo
 /* non-export function prototypes */
 static void lazy_scan_heap(LVRelState *vacrel);
 static void heap_vacuum_eager_scan_setup(LVRelState *vacrel,
-										 VacuumParams *params);
+										 const VacuumParams params);
 static BlockNumber heap_vac_scan_next_block(ReadStream *stream,
 											void *callback_private_data,
 											void *per_buffer_data);
@@ -431,7 +431,7 @@ static void find_next_unskippable_block(LVRelState *vacrel, bool *skipsallvis);
 static bool lazy_scan_new_or_empty(LVRelState *vacrel, Buffer buf,
 								   BlockNumber blkno, Page page,
 								   bool sharelock, Buffer vmbuffer);
-static void lazy_scan_prune(LVRelState *vacrel, Buffer buf,
+static int	lazy_scan_prune(LVRelState *vacrel, Buffer buf,
 							BlockNumber blkno, Page page,
 							Buffer vmbuffer, bool all_visible_according_to_vm,
 							bool *has_lpdead_items, bool *vm_page_frozen);
@@ -485,7 +485,7 @@ static void restore_vacuum_error_info(LVRelState *vacrel,
  * vacuum options or for relfrozenxid/relminmxid advancement.
  */
 static void
-heap_vacuum_eager_scan_setup(LVRelState *vacrel, VacuumParams *params)
+heap_vacuum_eager_scan_setup(LVRelState *vacrel, const VacuumParams params)
 {
 	uint32		randseed;
 	BlockNumber allvisible;
@@ -504,7 +504,7 @@ heap_vacuum_eager_scan_setup(LVRelState *vacrel, VacuumParams *params)
 	vacrel->eager_scan_remaining_successes = 0;
 
 	/* If eager scanning is explicitly disabled, just return. */
-	if (params->max_eager_freeze_failure_rate == 0)
+	if (params.max_eager_freeze_failure_rate == 0)
 		return;
 
 	/*
@@ -581,11 +581,11 @@ heap_vacuum_eager_scan_setup(LVRelState *vacrel, VacuumParams *params)
 
 	vacrel->next_eager_scan_region_start = randseed % EAGER_SCAN_REGION_SIZE;
 
-	Assert(params->max_eager_freeze_failure_rate > 0 &&
-		   params->max_eager_freeze_failure_rate <= 1);
+	Assert(params.max_eager_freeze_failure_rate > 0 &&
+		   params.max_eager_freeze_failure_rate <= 1);
 
 	vacrel->eager_scan_max_fails_per_region =
-		params->max_eager_freeze_failure_rate *
+		params.max_eager_freeze_failure_rate *
 		EAGER_SCAN_REGION_SIZE;
 
 	/*
@@ -612,7 +612,7 @@ heap_vacuum_eager_scan_setup(LVRelState *vacrel, VacuumParams *params)
  *		and locked the relation.
  */
 void
-heap_vacuum_rel(Relation rel, VacuumParams *params,
+heap_vacuum_rel(Relation rel, const VacuumParams params,
 				BufferAccessStrategy bstrategy)
 {
 	LVRelState *vacrel;
@@ -634,9 +634,9 @@ heap_vacuum_rel(Relation rel, VacuumParams *params,
 	ErrorContextCallback errcallback;
 	char	  **indnames = NULL;
 
-	verbose = (params->options & VACOPT_VERBOSE) != 0;
+	verbose = (params.options & VACOPT_VERBOSE) != 0;
 	instrument = (verbose || (AmAutoVacuumWorkerProcess() &&
-							  params->log_min_duration >= 0));
+							  params.log_min_duration >= 0));
 	if (instrument)
 	{
 		pg_rusage_init(&ru0);
@@ -699,9 +699,9 @@ heap_vacuum_rel(Relation rel, VacuumParams *params,
 	 * The truncate param allows user to avoid attempting relation truncation,
 	 * though it can't force truncation to happen.
 	 */
-	Assert(params->index_cleanup != VACOPTVALUE_UNSPECIFIED);
-	Assert(params->truncate != VACOPTVALUE_UNSPECIFIED &&
-		   params->truncate != VACOPTVALUE_AUTO);
+	Assert(params.index_cleanup != VACOPTVALUE_UNSPECIFIED);
+	Assert(params.truncate != VACOPTVALUE_UNSPECIFIED &&
+		   params.truncate != VACOPTVALUE_AUTO);
 
 	/*
 	 * While VacuumFailSafeActive is reset to false before calling this, we
@@ -711,14 +711,14 @@ heap_vacuum_rel(Relation rel, VacuumParams *params,
 	vacrel->consider_bypass_optimization = true;
 	vacrel->do_index_vacuuming = true;
 	vacrel->do_index_cleanup = true;
-	vacrel->do_rel_truncate = (params->truncate != VACOPTVALUE_DISABLED);
-	if (params->index_cleanup == VACOPTVALUE_DISABLED)
+	vacrel->do_rel_truncate = (params.truncate != VACOPTVALUE_DISABLED);
+	if (params.index_cleanup == VACOPTVALUE_DISABLED)
 	{
 		/* Force disable index vacuuming up-front */
 		vacrel->do_index_vacuuming = false;
 		vacrel->do_index_cleanup = false;
 	}
-	else if (params->index_cleanup == VACOPTVALUE_ENABLED)
+	else if (params.index_cleanup == VACOPTVALUE_ENABLED)
 	{
 		/* Force index vacuuming.  Note that failsafe can still bypass. */
 		vacrel->consider_bypass_optimization = false;
@@ -726,7 +726,7 @@ heap_vacuum_rel(Relation rel, VacuumParams *params,
 	else
 	{
 		/* Default/auto, make all decisions dynamically */
-		Assert(params->index_cleanup == VACOPTVALUE_AUTO);
+		Assert(params.index_cleanup == VACOPTVALUE_AUTO);
 	}
 
 	/* Initialize page counters explicitly (be tidy) */
@@ -757,7 +757,6 @@ heap_vacuum_rel(Relation rel, VacuumParams *params,
 	vacrel->vm_new_visible_pages = 0;
 	vacrel->vm_new_visible_frozen_pages = 0;
 	vacrel->vm_new_frozen_pages = 0;
-	vacrel->rel_pages = orig_rel_pages = RelationGetNumberOfBlocks(rel);
 
 	/*
 	 * Get cutoffs that determine which deleted tuples are considered DEAD,
@@ -776,7 +775,9 @@ heap_vacuum_rel(Relation rel, VacuumParams *params,
 	 * to increase the number of dead tuples it can prune away.)
 	 */
 	vacrel->aggressive = vacuum_get_cutoffs(rel, params, &vacrel->cutoffs);
+	vacrel->rel_pages = orig_rel_pages = RelationGetNumberOfBlocks(rel);
 	vacrel->vistest = GlobalVisTestFor(rel);
+
 	/* Initialize state used to track oldest extant XID/MXID */
 	vacrel->NewRelfrozenXid = vacrel->cutoffs.OldestXmin;
 	vacrel->NewRelminMxid = vacrel->cutoffs.OldestMxact;
@@ -788,7 +789,7 @@ heap_vacuum_rel(Relation rel, VacuumParams *params,
 	 */
 	vacrel->skippedallvis = false;
 	skipwithvm = true;
-	if (params->options & VACOPT_DISABLE_PAGE_SKIPPING)
+	if (params.options & VACOPT_DISABLE_PAGE_SKIPPING)
 	{
 		/*
 		 * Force aggressive mode, and disable skipping blocks using the
@@ -829,7 +830,7 @@ heap_vacuum_rel(Relation rel, VacuumParams *params,
 	 * is already dangerously old.)
 	 */
 	lazy_check_wraparound_failsafe(vacrel);
-	dead_items_alloc(vacrel, params->nworkers);
+	dead_items_alloc(vacrel, params.nworkers);
 
 	/*
 	 * Call lazy_scan_heap to perform all required heap pruning, index
@@ -946,9 +947,9 @@ heap_vacuum_rel(Relation rel, VacuumParams *params,
 	{
 		TimestampTz endtime = GetCurrentTimestamp();
 
-		if (verbose || params->log_min_duration == 0 ||
+		if (verbose || params.log_min_duration == 0 ||
 			TimestampDifferenceExceeds(starttime, endtime,
-									   params->log_min_duration))
+									   params.log_min_duration))
 		{
 			long		secs_dur;
 			int			usecs_dur;
@@ -983,10 +984,10 @@ heap_vacuum_rel(Relation rel, VacuumParams *params,
 				 * Aggressiveness already reported earlier, in dedicated
 				 * VACUUM VERBOSE ereport
 				 */
-				Assert(!params->is_wraparound);
+				Assert(!params.is_wraparound);
 				msgfmt = _("finished vacuuming \"%s.%s.%s\": index scans: %d\n");
 			}
-			else if (params->is_wraparound)
+			else if (params.is_wraparound)
 			{
 				/*
 				 * While it's possible for a VACUUM to be both is_wraparound
@@ -1244,6 +1245,7 @@ lazy_scan_heap(LVRelState *vacrel)
 		Buffer		buf;
 		Page		page;
 		uint8		blk_info = 0;
+		int			ndeleted = 0;
 		bool		has_lpdead_items;
 		void	   *per_buffer_data = NULL;
 		bool		vm_page_frozen = false;
@@ -1386,10 +1388,10 @@ lazy_scan_heap(LVRelState *vacrel)
 		 * line pointers previously marked LP_DEAD.
 		 */
 		if (got_cleanup_lock)
-			lazy_scan_prune(vacrel, buf, blkno, page,
-							vmbuffer,
-							blk_info & VAC_BLK_ALL_VISIBLE_ACCORDING_TO_VM,
-							&has_lpdead_items, &vm_page_frozen);
+			ndeleted = lazy_scan_prune(vacrel, buf, blkno, page,
+									   vmbuffer,
+									   blk_info & VAC_BLK_ALL_VISIBLE_ACCORDING_TO_VM,
+									   &has_lpdead_items, &vm_page_frozen);
 
 		/*
 		 * Count an eagerly scanned page as a failure or a success.
@@ -1413,12 +1415,26 @@ lazy_scan_heap(LVRelState *vacrel)
 
 			if (vm_page_frozen)
 			{
-				Assert(vacrel->eager_scan_remaining_successes > 0);
-				vacrel->eager_scan_remaining_successes--;
+				if (vacrel->eager_scan_remaining_successes > 0)
+					vacrel->eager_scan_remaining_successes--;
 
 				if (vacrel->eager_scan_remaining_successes == 0)
 				{
 					/*
+					 * Report only once that we disabled eager scanning. We
+					 * may eagerly read ahead blocks in excess of the success
+					 * or failure caps before attempting to freeze them, so we
+					 * could reach here even after disabling additional eager
+					 * scanning.
+					 */
+					if (vacrel->eager_scan_max_fails_per_region > 0)
+						ereport(vacrel->verbose ? INFO : DEBUG2,
+								(errmsg("disabling eager scanning after freezing %u eagerly scanned blocks of relation \"%s.%s.%s\"",
+										orig_eager_scan_success_limit,
+										vacrel->dbname, vacrel->relnamespace,
+										vacrel->relname)));
+
+					/*
 					 * If we hit our success cap, permanently disable eager
 					 * scanning by setting the other eager scan management
 					 * fields to their disabled values.
@@ -1426,19 +1442,10 @@ lazy_scan_heap(LVRelState *vacrel)
 					vacrel->eager_scan_remaining_fails = 0;
 					vacrel->next_eager_scan_region_start = InvalidBlockNumber;
 					vacrel->eager_scan_max_fails_per_region = 0;
-
-					ereport(vacrel->verbose ? INFO : DEBUG2,
-							(errmsg("disabling eager scanning after freezing %u eagerly scanned blocks of \"%s.%s.%s\"",
-									orig_eager_scan_success_limit,
-									vacrel->dbname, vacrel->relnamespace,
-									vacrel->relname)));
 				}
 			}
-			else
-			{
-				Assert(vacrel->eager_scan_remaining_fails > 0);
+			else if (vacrel->eager_scan_remaining_fails > 0)
 				vacrel->eager_scan_remaining_fails--;
-			}
 		}
 
 		/*
@@ -1475,7 +1482,7 @@ lazy_scan_heap(LVRelState *vacrel)
 			 * table has indexes. There will only be newly-freed space if we
 			 * held the cleanup lock and lazy_scan_prune() was called.
 			 */
-			if (got_cleanup_lock && vacrel->nindexes == 0 && has_lpdead_items &&
+			if (got_cleanup_lock && vacrel->nindexes == 0 && ndeleted > 0 &&
 				blkno - next_fsm_block_to_vacuum >= VACUUM_FSM_EVERY_PAGES)
 			{
 				FreeSpaceMapVacuumRange(vacrel->rel, next_fsm_block_to_vacuum,
@@ -1866,8 +1873,6 @@ lazy_scan_new_or_empty(LVRelState *vacrel, Buffer buf, BlockNumber blkno,
 		 */
 		if (!PageIsAllVisible(page))
 		{
-			uint8		old_vmbits;
-
 			START_CRIT_SECTION();
 
 			/* mark buffer dirty before writing a WAL record */
@@ -1887,24 +1892,16 @@ lazy_scan_new_or_empty(LVRelState *vacrel, Buffer buf, BlockNumber blkno,
 				log_newpage_buffer(buf, true);
 
 			PageSetAllVisible(page);
-			old_vmbits = visibilitymap_set(vacrel->rel, blkno, buf,
-										   InvalidXLogRecPtr,
-										   vmbuffer, InvalidTransactionId,
-										   VISIBILITYMAP_ALL_VISIBLE |
-										   VISIBILITYMAP_ALL_FROZEN);
+			visibilitymap_set(vacrel->rel, blkno, buf,
+							  InvalidXLogRecPtr,
+							  vmbuffer, InvalidTransactionId,
+							  VISIBILITYMAP_ALL_VISIBLE |
+							  VISIBILITYMAP_ALL_FROZEN);
 			END_CRIT_SECTION();
 
-			/*
-			 * If the page wasn't already set all-visible and/or all-frozen in
-			 * the VM, count it as newly set for logging.
-			 */
-			if ((old_vmbits & VISIBILITYMAP_ALL_VISIBLE) == 0)
-			{
-				vacrel->vm_new_visible_pages++;
-				vacrel->vm_new_visible_frozen_pages++;
-			}
-			else if ((old_vmbits & VISIBILITYMAP_ALL_FROZEN) == 0)
-				vacrel->vm_new_frozen_pages++;
+			/* Count the newly all-frozen pages for logging */
+			vacrel->vm_new_visible_pages++;
+			vacrel->vm_new_visible_frozen_pages++;
 		}
 
 		freespace = PageGetHeapFreeSpace(page);
@@ -1940,8 +1937,10 @@ cmpOffsetNumbers(const void *a, const void *b)
  * *vm_page_frozen is set to true if the page is newly set all-frozen in the
  * VM. The caller currently only uses this for determining whether an eagerly
  * scanned page was successfully set all-frozen.
+ *
+ * Returns the number of tuples deleted from the page during HOT pruning.
  */
-static void
+static int
 lazy_scan_prune(LVRelState *vacrel,
 				Buffer buf,
 				BlockNumber blkno,
@@ -2212,6 +2211,8 @@ lazy_scan_prune(LVRelState *vacrel,
 			*vm_page_frozen = true;
 		}
 	}
+
+	return presult.ndeleted;
 }
 
 /*
@@ -2909,7 +2910,6 @@ lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, Buffer buffer,
 	if (heap_page_is_all_visible(vacrel, buffer, &visibility_cutoff_xid,
 								 &all_frozen))
 	{
-		uint8		old_vmbits;
 		uint8		flags = VISIBILITYMAP_ALL_VISIBLE;
 
 		if (all_frozen)
@@ -2919,25 +2919,15 @@ lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, Buffer buffer,
 		}
 
 		PageSetAllVisible(page);
-		old_vmbits = visibilitymap_set(vacrel->rel, blkno, buffer,
-									   InvalidXLogRecPtr,
-									   vmbuffer, visibility_cutoff_xid,
-									   flags);
-
-		/*
-		 * If the page wasn't already set all-visible and/or all-frozen in the
-		 * VM, count it as newly set for logging.
-		 */
-		if ((old_vmbits & VISIBILITYMAP_ALL_VISIBLE) == 0)
-		{
-			vacrel->vm_new_visible_pages++;
-			if (all_frozen)
-				vacrel->vm_new_visible_frozen_pages++;
-		}
+		visibilitymap_set(vacrel->rel, blkno, buffer,
+						  InvalidXLogRecPtr,
+						  vmbuffer, visibility_cutoff_xid,
+						  flags);
 
-		else if ((old_vmbits & VISIBILITYMAP_ALL_FROZEN) == 0 &&
-				 all_frozen)
-			vacrel->vm_new_frozen_pages++;
+		/* Count the newly set VM page for logging */
+		vacrel->vm_new_visible_pages++;
+		if (all_frozen)
+			vacrel->vm_new_visible_frozen_pages++;
 	}
 
 	/* Revert to the previous phase information for error traceback */
diff --git a/src/backend/access/nbtree/nbtpreprocesskeys.c b/src/backend/access/nbtree/nbtpreprocesskeys.c
index a136e4bbfdf..21c519cd108 100644
--- a/src/backend/access/nbtree/nbtpreprocesskeys.c
+++ b/src/backend/access/nbtree/nbtpreprocesskeys.c
@@ -16,6 +16,7 @@
 #include "postgres.h"
 
 #include "access/nbtree.h"
+#include "common/int.h"
 #include "lib/qunique.h"
 #include "utils/array.h"
 #include "utils/lsyscache.h"
@@ -56,6 +57,8 @@ static void _bt_skiparray_strat_decrement(IndexScanDesc scan, ScanKey arraysk,
 										  BTArrayKeyInfo *array);
 static void _bt_skiparray_strat_increment(IndexScanDesc scan, ScanKey arraysk,
 										  BTArrayKeyInfo *array);
+static void _bt_unmark_keys(IndexScanDesc scan, int *keyDataMap);
+static int	_bt_reorder_array_cmp(const void *a, const void *b);
 static ScanKey _bt_preprocess_array_keys(IndexScanDesc scan, int *new_numberOfKeys);
 static void _bt_preprocess_array_keys_final(IndexScanDesc scan, int *keyDataMap);
 static int	_bt_num_array_keys(IndexScanDesc scan, Oid *skip_eq_ops_out,
@@ -96,7 +99,7 @@ static int	_bt_compare_array_elements(const void *a, const void *b, void *arg);
  * incomplete sets of cross-type operators, we may fail to detect redundant
  * or contradictory keys, but we can survive that.)
  *
- * The output keys must be sorted by index attribute.  Presently we expect
+ * Required output keys are sorted by index attribute.  Presently we expect
  * (but verify) that the input keys are already so sorted --- this is done
  * by match_clauses_to_index() in indxpath.c.  Some reordering of the keys
  * within each attribute may be done as a byproduct of the processing here.
@@ -127,29 +130,36 @@ static int	_bt_compare_array_elements(const void *a, const void *b, void *arg);
  * This has the potential to be much more efficient than a full index scan
  * (though it behaves like a full scan when there's many distinct "x" values).
  *
- * If possible, redundant keys are eliminated: we keep only the tightest
+ * Typically, redundant keys are eliminated: we keep only the tightest
  * >/>= bound and the tightest </<= bound, and if there's an = key then
  * that's the only one returned.  (So, we return either a single = key,
  * or one or two boundary-condition keys for each attr.)  However, if we
  * cannot compare two keys for lack of a suitable cross-type operator,
- * we cannot eliminate either.  If there are two such keys of the same
- * operator strategy, the second one is just pushed into the output array
- * without further processing here.  We may also emit both >/>= or both
- * </<= keys if we can't compare them.  The logic about required keys still
- * works if we don't eliminate redundant keys.
- *
- * Note that one reason we need direction-sensitive required-key flags is
- * precisely that we may not be able to eliminate redundant keys.  Suppose
- * we have "x > 4::int AND x > 10::bigint", and we are unable to determine
- * which key is more restrictive for lack of a suitable cross-type operator.
- * _bt_first will arbitrarily pick one of the keys to do the initial
- * positioning with.  If it picks x > 4, then the x > 10 condition will fail
- * until we reach index entries > 10; but we can't stop the scan just because
- * x > 10 is failing.  On the other hand, if we are scanning backwards, then
- * failure of either key is indeed enough to stop the scan.  (In general, when
- * inequality keys are present, the initial-positioning code only promises to
- * position before the first possible match, not exactly at the first match,
- * for a forward scan; or after the last match for a backward scan.)
+ * we cannot eliminate either key.
+ *
+ * When not all redundant keys could be eliminated, we'll output a key array
+ * that can more or less be treated as if it had no redundant keys.  Suppose
+ * we have "x > 4::int AND x > 10::bigint AND x < 70", and we are unable to
+ * determine which > key is more restrictive for lack of a suitable cross-type
+ * operator.  We'll arbitrarily pick one of the > keys; the other > key won't
+ * be marked required.  Obviously, the scan will be less efficient if we
+ * choose x > 4 over x > 10 -- but it can still largely proceed as if there
+ * was only a single > condition.  "x > 10" will be placed at the end of the
+ * so->keyData[] output array.  It'll always be evaluated last, after the keys
+ * that could be marked required in the usual way (after "x > 4 AND x < 70").
+ * This can sometimes result in so->keyData[] keys that aren't even in index
+ * attribute order (if the qual involves multiple attributes).  The scan's
+ * required keys will still be in attribute order, though, so it can't matter.
+ *
+ * This scheme ensures that _bt_first always uses the same set of keys at the
+ * start of a forwards scan as those _bt_checkkeys uses to determine when to
+ * end a similar backwards scan (and vice-versa).  _bt_advance_array_keys
+ * depends on this: it expects to be able to reliably predict what the next
+ * _bt_first call will do by testing whether _bt_checkkeys' routines report
+ * that the final tuple on the page is past the end of matches for the scan's
+ * keys with the scan direction flipped.  If it is (if continuescan=false),
+ * then it follows that calling _bt_first will, at a minimum, relocate the
+ * scan to the very next leaf page (in the current scan direction).
  *
  * As a byproduct of this work, we can detect contradictory quals such
  * as "x = 1 AND x > 2".  If we see that, we return so->qual_ok = false,
@@ -188,7 +198,8 @@ _bt_preprocess_keys(IndexScanDesc scan)
 	int			numberOfEqualCols;
 	ScanKey		inkeys;
 	BTScanKeyPreproc xform[BTMaxStrategyNumber];
-	bool		test_result;
+	bool		test_result,
+				redundant_key_kept = false;
 	AttrNumber	attno;
 	ScanKey		arrayKeyData;
 	int		   *keyDataMap = NULL;
@@ -388,7 +399,8 @@ _bt_preprocess_keys(IndexScanDesc scan)
 						xform[j].inkey = NULL;
 						xform[j].inkeyi = -1;
 					}
-					/* else, cannot determine redundancy, keep both keys */
+					else
+						redundant_key_kept = true;
 				}
 				/* track number of attrs for which we have "=" keys */
 				numberOfEqualCols++;
@@ -409,6 +421,8 @@ _bt_preprocess_keys(IndexScanDesc scan)
 					else
 						xform[BTLessStrategyNumber - 1].inkey = NULL;
 				}
+				else
+					redundant_key_kept = true;
 			}
 
 			/* try to keep only one of >, >= */
@@ -426,6 +440,8 @@ _bt_preprocess_keys(IndexScanDesc scan)
 					else
 						xform[BTGreaterStrategyNumber - 1].inkey = NULL;
 				}
+				else
+					redundant_key_kept = true;
 			}
 
 			/*
@@ -466,25 +482,6 @@ _bt_preprocess_keys(IndexScanDesc scan)
 		/* check strategy this key's operator corresponds to */
 		j = inkey->sk_strategy - 1;
 
-		/* if row comparison, push it directly to the output array */
-		if (inkey->sk_flags & SK_ROW_HEADER)
-		{
-			ScanKey		outkey = &so->keyData[new_numberOfKeys++];
-
-			memcpy(outkey, inkey, sizeof(ScanKeyData));
-			if (arrayKeyData)
-				keyDataMap[new_numberOfKeys - 1] = i;
-			if (numberOfEqualCols == attno - 1)
-				_bt_mark_scankey_required(outkey);
-
-			/*
-			 * We don't support RowCompare using equality; such a qual would
-			 * mess up the numberOfEqualCols tracking.
-			 */
-			Assert(j != (BTEqualStrategyNumber - 1));
-			continue;
-		}
-
 		if (inkey->sk_strategy == BTEqualStrategyNumber &&
 			(inkey->sk_flags & SK_SEARCHARRAY))
 		{
@@ -593,9 +590,8 @@ _bt_preprocess_keys(IndexScanDesc scan)
 				 * the new scan key.
 				 *
 				 * Note: We do things this way around so that our arrays are
-				 * always in the same order as their corresponding scan keys,
-				 * even with incomplete opfamilies.  _bt_advance_array_keys
-				 * depends on this.
+				 * always in the same order as their corresponding scan keys.
+				 * _bt_preprocess_array_keys_final expects this.
 				 */
 				ScanKey		outkey = &so->keyData[new_numberOfKeys++];
 
@@ -607,6 +603,7 @@ _bt_preprocess_keys(IndexScanDesc scan)
 				xform[j].inkey = inkey;
 				xform[j].inkeyi = i;
 				xform[j].arrayidx = arrayidx;
+				redundant_key_kept = true;
 			}
 		}
 	}
@@ -622,6 +619,15 @@ _bt_preprocess_keys(IndexScanDesc scan)
 	if (arrayKeyData)
 		_bt_preprocess_array_keys_final(scan, keyDataMap);
 
+	/*
+	 * If there are remaining redundant inequality keys, we must make sure
+	 * that each index attribute has no more than one required >/>= key, and
+	 * no more than one required </<= key.  Attributes that have one or more
+	 * required = keys now must keep only one required key (the first = key).
+	 */
+	if (unlikely(redundant_key_kept) && so->qual_ok)
+		_bt_unmark_keys(scan, keyDataMap);
+
 	/* Could pfree arrayKeyData/keyDataMap now, but not worth the cycles */
 }
 
@@ -746,9 +752,12 @@ _bt_fix_scankey_strategy(ScanKey skey, int16 *indoption)
  *
  * Depending on the operator type, the key may be required for both scan
  * directions or just one.  Also, if the key is a row comparison header,
- * we have to mark its first subsidiary ScanKey as required.  (Subsequent
- * subsidiary ScanKeys are normally for lower-order columns, and thus
- * cannot be required, since they're after the first non-equality scankey.)
+ * we have to mark the appropriate subsidiary ScanKeys as required.  In such
+ * cases, the first subsidiary key is required, but subsequent ones are
+ * required only as long as they correspond to successive index columns and
+ * match the leading column as to sort direction.  Otherwise the row
+ * comparison ordering is different from the index ordering and so we can't
+ * stop the scan on the basis of those lower-order columns.
  *
  * Note: when we set required-key flag bits in a subsidiary scankey, we are
  * scribbling on a data structure belonging to the index AM's caller, not on
@@ -786,12 +795,25 @@ _bt_mark_scankey_required(ScanKey skey)
 	if (skey->sk_flags & SK_ROW_HEADER)
 	{
 		ScanKey		subkey = (ScanKey) DatumGetPointer(skey->sk_argument);
+		AttrNumber	attno = skey->sk_attno;
 
 		/* First subkey should be same column/operator as the header */
-		Assert(subkey->sk_flags & SK_ROW_MEMBER);
-		Assert(subkey->sk_attno == skey->sk_attno);
+		Assert(subkey->sk_attno == attno);
 		Assert(subkey->sk_strategy == skey->sk_strategy);
-		subkey->sk_flags |= addflags;
+
+		for (;;)
+		{
+			Assert(subkey->sk_flags & SK_ROW_MEMBER);
+			if (subkey->sk_attno != attno)
+				break;			/* non-adjacent key, so not required */
+			if (subkey->sk_strategy != skey->sk_strategy)
+				break;			/* wrong direction, so not required */
+			subkey->sk_flags |= addflags;
+			if (subkey->sk_flags & SK_ROW_END)
+				break;
+			subkey++;
+			attno++;
+		}
 	}
 }
 
@@ -847,8 +869,7 @@ _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op,
 				cmp_op;
 	StrategyNumber strat;
 
-	Assert(!((leftarg->sk_flags | rightarg->sk_flags) &
-			 (SK_ROW_HEADER | SK_ROW_MEMBER)));
+	Assert(!((leftarg->sk_flags | rightarg->sk_flags) & SK_ROW_MEMBER));
 
 	/*
 	 * First, deal with cases where one or both args are NULL.  This should
@@ -925,6 +946,16 @@ _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op,
 	}
 
 	/*
+	 * We don't yet know how to determine redundancy when it involves a row
+	 * compare key (barring simple cases involving IS NULL/IS NOT NULL)
+	 */
+	if ((leftarg->sk_flags | rightarg->sk_flags) & SK_ROW_HEADER)
+	{
+		Assert(!((leftarg->sk_flags | rightarg->sk_flags) & SK_BT_SKIP));
+		return false;
+	}
+
+	/*
 	 * If either leftarg or rightarg are equality-type array scankeys, we need
 	 * specialized handling (since by now we know that IS NULL wasn't used)
 	 */
@@ -1468,6 +1499,283 @@ _bt_skiparray_strat_increment(IndexScanDesc scan, ScanKey arraysk,
 }
 
 /*
+ *	_bt_unmark_keys() -- make superfluous required keys nonrequired after all
+ *
+ * When _bt_preprocess_keys fails to eliminate one or more redundant keys, it
+ * calls here to make sure that no index attribute has more than one > or >=
+ * key marked required, and no more than one required < or <= key.  Attributes
+ * with = keys will always get one = key as their required key.  All other
+ * keys that were initially marked required get "unmarked" here.  That way,
+ * _bt_first and _bt_checkkeys will reliably agree on which keys to use to
+ * start and/or to end the scan.
+ *
+ * We also relocate keys that become/started out nonrequired to the end of
+ * so->keyData[].  That way, _bt_first and _bt_checkkeys cannot fail to reach
+ * a required key due to some earlier nonrequired key getting in the way.
+ *
+ * Only call here when _bt_compare_scankey_args returned false at least once
+ * (otherwise, calling here will just waste cycles).
+ */
+static void
+_bt_unmark_keys(IndexScanDesc scan, int *keyDataMap)
+{
+	BTScanOpaque so = (BTScanOpaque) scan->opaque;
+	AttrNumber	attno;
+	bool	   *unmarkikey;
+	int			nunmark,
+				nunmarked,
+				nkept,
+				firsti;
+	ScanKey		keepKeys,
+				unmarkKeys;
+	FmgrInfo   *keepOrderProcs = NULL,
+			   *unmarkOrderProcs = NULL;
+	bool		haveReqEquals,
+				haveReqForward,
+				haveReqBackward;
+
+	/*
+	 * Do an initial pass over so->keyData[] that determines which keys to
+	 * keep as required.  We expect so->keyData[] to still be in attribute
+	 * order when we're called (though we don't expect any particular order
+	 * among each attribute's keys).
+	 *
+	 * When both equality and inequality keys remain on a single attribute, we
+	 * *must* make sure that exactly one of the equalities remains required.
+	 * Any requiredness markings that we might leave on later keys/attributes
+	 * are predicated on there being required = keys on all prior columns.
+	 */
+	unmarkikey = palloc0(so->numberOfKeys * sizeof(bool));
+	nunmark = 0;
+
+	/* Set things up for first key's attribute */
+	attno = so->keyData[0].sk_attno;
+	firsti = 0;
+	haveReqEquals = false;
+	haveReqForward = false;
+	haveReqBackward = false;
+	for (int i = 0; i < so->numberOfKeys; i++)
+	{
+		ScanKey		origkey = &so->keyData[i];
+
+		if (origkey->sk_attno != attno)
+		{
+			/* Reset for next attribute */
+			attno = origkey->sk_attno;
+			firsti = i;
+
+			haveReqEquals = false;
+			haveReqForward = false;
+			haveReqBackward = false;
+		}
+
+		/* Equalities get priority over inequalities */
+		if (haveReqEquals)
+		{
+			/*
+			 * We already found the first "=" key for this attribute.  We've
+			 * already decided that all its other keys will be unmarked.
+			 */
+			Assert(!(origkey->sk_flags & SK_SEARCHNULL));
+			unmarkikey[i] = true;
+			nunmark++;
+			continue;
+		}
+		else if ((origkey->sk_flags & SK_BT_REQFWD) &&
+				 (origkey->sk_flags & SK_BT_REQBKWD))
+		{
+			/*
+			 * Found the first "=" key for attno.  All other attno keys will
+			 * be unmarked.
+			 */
+			Assert(origkey->sk_strategy == BTEqualStrategyNumber);
+
+			haveReqEquals = true;
+			for (int j = firsti; j < i; j++)
+			{
+				/* Unmark any prior inequality keys on attno after all */
+				if (!unmarkikey[j])
+				{
+					unmarkikey[j] = true;
+					nunmark++;
+				}
+			}
+			continue;
+		}
+
+		/* Deal with inequalities next */
+		if ((origkey->sk_flags & SK_BT_REQFWD) && !haveReqForward)
+		{
+			haveReqForward = true;
+			continue;
+		}
+		else if ((origkey->sk_flags & SK_BT_REQBKWD) && !haveReqBackward)
+		{
+			haveReqBackward = true;
+			continue;
+		}
+
+		/*
+		 * We have either a redundant inequality key that will be unmarked, or
+		 * we have a key that wasn't marked required in the first place
+		 */
+		unmarkikey[i] = true;
+		nunmark++;
+	}
+
+	/* Should only be called when _bt_compare_scankey_args reported failure */
+	Assert(nunmark > 0);
+
+	/*
+	 * Next, allocate temp arrays: one for required keys that'll remain
+	 * required, the other for all remaining keys
+	 */
+	unmarkKeys = palloc(nunmark * sizeof(ScanKeyData));
+	keepKeys = palloc((so->numberOfKeys - nunmark) * sizeof(ScanKeyData));
+	nunmarked = 0;
+	nkept = 0;
+	if (so->numArrayKeys)
+	{
+		unmarkOrderProcs = palloc(nunmark * sizeof(FmgrInfo));
+		keepOrderProcs = palloc((so->numberOfKeys - nunmark) * sizeof(FmgrInfo));
+	}
+
+	/*
+	 * Next, copy the contents of so->keyData[] into the appropriate temp
+	 * array.
+	 *
+	 * Scans with = array keys need us to maintain invariants around the order
+	 * of so->orderProcs[] and so->arrayKeys[] relative to so->keyData[].  See
+	 * _bt_preprocess_array_keys_final for a full explanation.
+	 */
+	for (int i = 0; i < so->numberOfKeys; i++)
+	{
+		ScanKey		origkey = &so->keyData[i];
+		ScanKey		unmark;
+
+		if (!unmarkikey[i])
+		{
+			/*
+			 * Key gets to keep its original requiredness markings.
+			 *
+			 * Key will stay in its original position, unless we're going to
+			 * unmark an earlier key (then this key moves to a lower offset).
+			 */
+			memcpy(keepKeys + nkept, origkey, sizeof(ScanKeyData));
+
+			if (so->numArrayKeys)
+			{
+				keyDataMap[i] = nkept;
+				memcpy(keepOrderProcs + nkept, &so->orderProcs[i],
+					   sizeof(FmgrInfo));
+			}
+
+			nkept++;
+			continue;
+		}
+
+		/*
+		 * Key will be unmarked as needed, and moved to the end of the array,
+		 * next to other keys that will become (or always were) nonrequired
+		 */
+		unmark = unmarkKeys + nunmarked;
+		memcpy(unmark, origkey, sizeof(ScanKeyData));
+
+		if (so->numArrayKeys)
+		{
+			keyDataMap[i] = (so->numberOfKeys - nunmark) + nunmarked;
+			memcpy(&unmarkOrderProcs[nunmarked], &so->orderProcs[i],
+				   sizeof(FmgrInfo));
+		}
+
+		/*
+		 * Preprocessing only generates skip arrays when it knows that they'll
+		 * be the only required = key on the attr.  We'll never unmark them.
+		 */
+		Assert(!(unmark->sk_flags & SK_BT_SKIP));
+
+		/*
+		 * Also shouldn't have to unmark an IS NULL or an IS NOT NULL key.
+		 * They aren't cross-type, so an incomplete opfamily can't matter.
+		 */
+		Assert(!(unmark->sk_flags & SK_ISNULL) ||
+			   !(unmark->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)));
+
+		/* Clear requiredness flags on redundant key (and on any subkeys) */
+		unmark->sk_flags &= ~(SK_BT_REQFWD | SK_BT_REQBKWD);
+		if (unmark->sk_flags & SK_ROW_HEADER)
+		{
+			ScanKey		subkey = (ScanKey) DatumGetPointer(unmark->sk_argument);
+
+			Assert(subkey->sk_strategy == unmark->sk_strategy);
+			for (;;)
+			{
+				Assert(subkey->sk_flags & SK_ROW_MEMBER);
+				subkey->sk_flags &= ~(SK_BT_REQFWD | SK_BT_REQBKWD);
+				if (subkey->sk_flags & SK_ROW_END)
+					break;
+				subkey++;
+			}
+		}
+
+		nunmarked++;
+	}
+
+	/* Copy both temp arrays back into so->keyData[] to reorder */
+	Assert(nkept == so->numberOfKeys - nunmark);
+	Assert(nunmarked == nunmark);
+	memcpy(so->keyData, keepKeys, sizeof(ScanKeyData) * nkept);
+	memcpy(so->keyData + nkept, unmarkKeys, sizeof(ScanKeyData) * nunmarked);
+
+	/* Done with temp arrays */
+	pfree(unmarkikey);
+	pfree(keepKeys);
+	pfree(unmarkKeys);
+
+	/*
+	 * Now copy so->orderProcs[] temp entries needed by scans with = array
+	 * keys back (just like with the so->keyData[] temp arrays)
+	 */
+	if (so->numArrayKeys)
+	{
+		memcpy(so->orderProcs, keepOrderProcs, sizeof(FmgrInfo) * nkept);
+		memcpy(so->orderProcs + nkept, unmarkOrderProcs,
+			   sizeof(FmgrInfo) * nunmarked);
+
+		/* Also fix-up array->scan_key references */
+		for (int arridx = 0; arridx < so->numArrayKeys; arridx++)
+		{
+			BTArrayKeyInfo *array = &so->arrayKeys[arridx];
+
+			array->scan_key = keyDataMap[array->scan_key];
+		}
+
+		/*
+		 * Sort so->arrayKeys[] based on its new BTArrayKeyInfo.scan_key
+		 * offsets, so that its order matches so->keyData[] order as expected
+		 */
+		qsort(so->arrayKeys, so->numArrayKeys, sizeof(BTArrayKeyInfo),
+			  _bt_reorder_array_cmp);
+
+		/* Done with temp arrays */
+		pfree(unmarkOrderProcs);
+		pfree(keepOrderProcs);
+	}
+}
+
+/*
+ * qsort comparator that orders so->arrayKeys[] entries by scan_key offset
+ */
+static int
+_bt_reorder_array_cmp(const void *a, const void *b)
+{
+	const BTArrayKeyInfo *arraya = (const BTArrayKeyInfo *) a;
+	const BTArrayKeyInfo *arrayb = (const BTArrayKeyInfo *) b;
+
+	return pg_cmp_s32(arraya->scan_key, arrayb->scan_key);
+}
+
+/*
  *	_bt_preprocess_array_keys() -- Preprocess SK_SEARCHARRAY scan keys
  *
  * If there are any SK_SEARCHARRAY scan keys, deconstruct the array(s) and
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 765659887af..fdff960c130 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -228,6 +228,8 @@ btgettuple(IndexScanDesc scan, ScanDirection dir)
 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
 	bool		res;
 
+	Assert(scan->heapRelation != NULL);
+
 	/* btree indexes are never lossy */
 	scan->xs_recheck = false;
 
@@ -289,6 +291,8 @@ btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
 	int64		ntids = 0;
 	ItemPointer heapTid;
 
+	Assert(scan->heapRelation == NULL);
+
 	/* Each loop iteration performs another primitive index scan */
 	do
 	{
@@ -393,6 +397,34 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
 		BTScanPosInvalidate(so->currPos);
 	}
 
+	/*
+	 * We prefer to eagerly drop leaf page pins before btgettuple returns.
+	 * This avoids making VACUUM wait to acquire a cleanup lock on the page.
+	 *
+	 * We cannot safely drop leaf page pins during index-only scans due to a
+	 * race condition involving VACUUM setting pages all-visible in the VM.
+	 * It's also unsafe for plain index scans that use a non-MVCC snapshot.
+	 *
+	 * When we drop pins eagerly, the mechanism that marks so->killedItems[]
+	 * index tuples LP_DEAD has to deal with concurrent TID recycling races.
+	 * The scheme used to detect unsafe TID recycling won't work when scanning
+	 * unlogged relations (since it involves saving an affected page's LSN).
+	 * Opt out of eager pin dropping during unlogged relation scans for now
+	 * (this is preferable to opting out of kill_prior_tuple LP_DEAD setting).
+	 *
+	 * Also opt out of dropping leaf page pins eagerly during bitmap scans.
+	 * Pins cannot be held for more than an instant during bitmap scans either
+	 * way, so we might as well avoid wasting cycles on acquiring page LSNs.
+	 *
+	 * See nbtree/README section on making concurrent TID recycling safe.
+	 *
+	 * Note: so->dropPin should never change across rescans.
+	 */
+	so->dropPin = (!scan->xs_want_itup &&
+				   IsMVCCSnapshot(scan->xs_snapshot) &&
+				   RelationNeedsWAL(scan->indexRelation) &&
+				   scan->heapRelation != NULL);
+
 	so->markItemIndex = -1;
 	so->needPrimScan = false;
 	so->scanBehind = false;
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index fe9a3886913..4af1ff1e9e5 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -25,7 +25,7 @@
 #include "utils/rel.h"
 
 
-static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp);
+static inline void _bt_drop_lock_and_maybe_pin(Relation rel, BTScanOpaque so);
 static Buffer _bt_moveright(Relation rel, Relation heaprel, BTScanInsert key,
 							Buffer buf, bool forupdate, BTStack stack,
 							int access);
@@ -57,24 +57,29 @@ static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
 /*
  *	_bt_drop_lock_and_maybe_pin()
  *
- * Unlock the buffer; and if it is safe to release the pin, do that, too.
- * This will prevent vacuum from stalling in a blocked state trying to read a
- * page when a cursor is sitting on it.
- *
- * See nbtree/README section on making concurrent TID recycling safe.
+ * Unlock so->currPos.buf.  If so->dropPin is set, drop the pin, too.
+ * Dropping the pin prevents VACUUM from blocking on acquiring a cleanup lock.
  */
-static void
-_bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
+static inline void
+_bt_drop_lock_and_maybe_pin(Relation rel, BTScanOpaque so)
 {
-	_bt_unlockbuf(scan->indexRelation, sp->buf);
-
-	if (IsMVCCSnapshot(scan->xs_snapshot) &&
-		RelationNeedsWAL(scan->indexRelation) &&
-		!scan->xs_want_itup)
+	if (!so->dropPin)
 	{
-		ReleaseBuffer(sp->buf);
-		sp->buf = InvalidBuffer;
+		/* Just drop the lock (not the pin) */
+		_bt_unlockbuf(rel, so->currPos.buf);
+		return;
 	}
+
+	/*
+	 * Drop both the lock and the pin.
+	 *
+	 * Have to set so->currPos.lsn so that _bt_killitems has a way to detect
+	 * when concurrent heap TID recycling by VACUUM might have taken place.
+	 */
+	Assert(RelationNeedsWAL(rel));
+	so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf);
+	_bt_relbuf(rel, so->currPos.buf);
+	so->currPos.buf = InvalidBuffer;
 }
 
 /*
@@ -866,8 +871,8 @@ _bt_compare(Relation rel,
  *		if backwards scan, the last item) in the tree that satisfies the
  *		qualifications in the scan key.  On success exit, data about the
  *		matching tuple(s) on the page has been loaded into so->currPos.  We'll
- *		drop all locks and hold onto a pin on page's buffer, except when
- *		_bt_drop_lock_and_maybe_pin dropped the pin to avoid blocking VACUUM.
+ *		drop all locks and hold onto a pin on page's buffer, except during
+ *		so->dropPin scans, when we drop both the lock and the pin.
  *		_bt_returnitem sets the next item to return to scan on success exit.
  *
  * If there are no matching items in the index, we return false, with no
@@ -955,46 +960,51 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 
 	/*----------
 	 * Examine the scan keys to discover where we need to start the scan.
+	 * The selected scan keys (at most one per index column) are remembered by
+	 * storing their addresses into the local startKeys[] array.  The final
+	 * startKeys[] entry's strategy is set in strat_total. (Actually, there
+	 * are a couple of cases where we force a less/more restrictive strategy.)
 	 *
-	 * We want to identify the keys that can be used as starting boundaries;
-	 * these are =, >, or >= keys for a forward scan or =, <, <= keys for
-	 * a backwards scan.  We can use keys for multiple attributes so long as
-	 * the prior attributes had only =, >= (resp. =, <=) keys.  Once we accept
-	 * a > or < boundary or find an attribute with no boundary (which can be
-	 * thought of as the same as "> -infinity"), we can't use keys for any
-	 * attributes to its right, because it would break our simplistic notion
-	 * of what initial positioning strategy to use.
+	 * We must use the key that was marked required (in the direction opposite
+	 * our own scan's) during preprocessing.  Each index attribute can only
+	 * have one such required key.  In general, the keys that we use to find
+	 * an initial position when scanning forwards are the same keys that end
+	 * the scan on the leaf level when scanning backwards (and vice-versa).
 	 *
 	 * When the scan keys include cross-type operators, _bt_preprocess_keys
-	 * may not be able to eliminate redundant keys; in such cases we will
-	 * arbitrarily pick a usable one for each attribute.  This is correct
-	 * but possibly not optimal behavior.  (For example, with keys like
-	 * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
-	 * x=5 would be more efficient.)  Since the situation only arises given
-	 * a poorly-worded query plus an incomplete opfamily, live with it.
+	 * may not be able to eliminate redundant keys; in such cases it will
+	 * arbitrarily pick a usable key for each attribute (and scan direction),
+	 * ensuring that there is no more than one key required in each direction.
+	 * We stop considering further keys once we reach the first nonrequired
+	 * key (which must come after all required keys), so this can't affect us.
+	 *
+	 * The required keys that we use as starting boundaries have to be =, >,
+	 * or >= keys for a forward scan or =, <, <= keys for a backwards scan.
+	 * We can use keys for multiple attributes so long as the prior attributes
+	 * had only =, >= (resp. =, <=) keys.  These rules are very similar to the
+	 * rules that preprocessing used to determine which keys to mark required.
+	 * We cannot always use every required key as a positioning key, though.
+	 * Skip arrays necessitate independently applying our own rules here.
+	 * Skip arrays are always considered = array keys, but we'll
+	 * nevertheless treat them as inequalities at certain points of the scan.
+	 * When that happens, it _might_ have implications for the number of
+	 * required keys that we can safely use for initial positioning purposes.
 	 *
-	 * When both equality and inequality keys appear for a single attribute
-	 * (again, only possible when cross-type operators appear), we *must*
-	 * select one of the equality keys for the starting point, because
-	 * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
-	 * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
-	 * start at x=4, we will fail and stop before reaching x=10.  If multiple
-	 * equality quals survive preprocessing, however, it doesn't matter which
-	 * one we use --- by definition, they are either redundant or
-	 * contradictory.
+	 * For example, a forward scan with a skip array on its leading attribute
+	 * (with no low_compare/high_compare) will have at least two required scan
+	 * keys, but we won't use any of them as boundary keys during the scan's
+	 * initial call here.  Our positioning key during the first call here can
+	 * be thought of as representing "> -infinity".  Similarly, if such a skip
+	 * array's low_compare is "a > 'foo'", then we position using "a > 'foo'"
+	 * during the scan's initial call here; a lower-order key such as "b = 42"
+	 * can't be used until the "a" array advances beyond MINVAL/low_compare.
 	 *
-	 * In practice we rarely see any "attribute boundary key gaps" here.
-	 * Preprocessing can usually backfill skip array keys for any attributes
-	 * that were omitted from the original scan->keyData[] input keys.  All
-	 * array keys are always considered = keys, but we'll sometimes need to
-	 * treat the current key value as if we were using an inequality strategy.
-	 * This happens with range skip arrays, which store inequality keys in the
-	 * array's low_compare/high_compare fields (used to find the first/last
-	 * set of matches, when = key will lack a usable sk_argument value).
-	 * These are always preferred over any redundant "standard" inequality
-	 * keys on the same column (per the usual rule about preferring = keys).
-	 * Note also that any column with an = skip array key can never have an
-	 * additional, contradictory = key.
+	 * On the other hand, if such a skip array's low_compare was "a >= 'foo'",
+	 * then we _can_ use "a >= 'foo' AND b = 42" during the initial call here.
+	 * A subsequent call here might have us use "a = 'fop' AND b = 42".  Note
+	 * that we treat = and >= as equivalent when scanning forwards (just as we
+	 * treat = and <= as equivalent when scanning backwards).  We effectively
+	 * do the same thing (though with a distinct "a" element/value) each time.
 	 *
 	 * All keys (with the exception of SK_SEARCHNULL keys and SK_BT_SKIP
 	 * array keys whose array is "null_elem=true") imply a NOT NULL qualifier.
@@ -1006,21 +1016,20 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 	 * traversing a lot of null entries at the start of the scan.
 	 *
 	 * In this loop, row-comparison keys are treated the same as keys on their
-	 * first (leftmost) columns.  We'll add on lower-order columns of the row
-	 * comparison below, if possible.
+	 * first (leftmost) columns.  We'll add all lower-order columns of the row
+	 * comparison that were marked required during preprocessing below.
 	 *
-	 * The selected scan keys (at most one per index column) are remembered by
-	 * storing their addresses into the local startKeys[] array.
-	 *
-	 * _bt_checkkeys/_bt_advance_array_keys decide whether and when to start
-	 * the next primitive index scan (for scans with array keys) based in part
-	 * on an understanding of how it'll enable us to reposition the scan.
-	 * They're directly aware of how we'll sometimes cons up an explicit
-	 * SK_SEARCHNOTNULL key.  They'll even end primitive scans by applying a
-	 * symmetric "deduce NOT NULL" rule of their own.  This allows top-level
-	 * scans to skip large groups of NULLs through repeated deductions about
-	 * key strictness (for a required inequality key) and whether NULLs in the
-	 * key's index column are stored last or first (relative to non-NULLs).
+	 * _bt_advance_array_keys needs to know exactly how we'll reposition the
+	 * scan (should it opt to schedule another primitive index scan).  It is
+	 * critical that primscans only be scheduled when they'll definitely make
+	 * some useful progress.  _bt_advance_array_keys does this by calling
+	 * _bt_checkkeys routines that report whether a tuple is past the end of
+	 * matches for the scan's keys (given the scan's current array elements).
+	 * If the page's final tuple is "after the end of matches" for a scan that
+	 * uses the *opposite* scan direction, then it must follow that it's also
+	 * "before the start of matches" for the actual current scan direction.
+	 * It is therefore essential that all of our initial positioning rules are
+	 * symmetric with _bt_checkkeys's corresponding continuescan=false rule.
 	 * If you update anything here, _bt_checkkeys/_bt_advance_array_keys might
 	 * need to be kept in sync.
 	 *----------
@@ -1029,18 +1038,17 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 	if (so->numberOfKeys > 0)
 	{
 		AttrNumber	curattr;
-		ScanKey		chosen;
+		ScanKey		bkey;
 		ScanKey		impliesNN;
 		ScanKey		cur;
 
 		/*
-		 * chosen is the so-far-chosen key for the current attribute, if any.
-		 * We don't cast the decision in stone until we reach keys for the
-		 * next attribute.
+		 * bkey will be set to the key that preprocessing left behind as the
+		 * boundary key for this attribute, in this scan direction (if any)
 		 */
 		cur = so->keyData;
 		curattr = 1;
-		chosen = NULL;
+		bkey = NULL;
 		/* Also remember any scankey that implies a NOT NULL constraint */
 		impliesNN = NULL;
 
@@ -1053,23 +1061,29 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 		{
 			if (i >= so->numberOfKeys || cur->sk_attno != curattr)
 			{
+				/* Done looking for the curattr boundary key */
+				Assert(bkey == NULL ||
+					   (bkey->sk_attno == curattr &&
+						(bkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))));
+				Assert(impliesNN == NULL ||
+					   (impliesNN->sk_attno == curattr &&
+						(impliesNN->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))));
+
 				/*
-				 * Done looking at keys for curattr.
-				 *
 				 * If this is a scan key for a skip array whose current
 				 * element is MINVAL, choose low_compare (when scanning
 				 * backwards it'll be MAXVAL, and we'll choose high_compare).
 				 *
-				 * Note: if the array's low_compare key makes 'chosen' NULL,
+				 * Note: if the array's low_compare key makes 'bkey' NULL,
 				 * then we behave as if the array's first element is -inf,
 				 * except when !array->null_elem implies a usable NOT NULL
 				 * constraint.
 				 */
-				if (chosen != NULL &&
-					(chosen->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL)))
+				if (bkey != NULL &&
+					(bkey->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL)))
 				{
-					int			ikey = chosen - so->keyData;
-					ScanKey		skipequalitykey = chosen;
+					int			ikey = bkey - so->keyData;
+					ScanKey		skipequalitykey = bkey;
 					BTArrayKeyInfo *array = NULL;
 
 					for (int arridx = 0; arridx < so->numArrayKeys; arridx++)
@@ -1082,35 +1096,35 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 					if (ScanDirectionIsForward(dir))
 					{
 						Assert(!(skipequalitykey->sk_flags & SK_BT_MAXVAL));
-						chosen = array->low_compare;
+						bkey = array->low_compare;
 					}
 					else
 					{
 						Assert(!(skipequalitykey->sk_flags & SK_BT_MINVAL));
-						chosen = array->high_compare;
+						bkey = array->high_compare;
 					}
 
-					Assert(chosen == NULL ||
-						   chosen->sk_attno == skipequalitykey->sk_attno);
+					Assert(bkey == NULL ||
+						   bkey->sk_attno == skipequalitykey->sk_attno);
 
 					if (!array->null_elem)
 						impliesNN = skipequalitykey;
 					else
-						Assert(chosen == NULL && impliesNN == NULL);
+						Assert(bkey == NULL && impliesNN == NULL);
 				}
 
 				/*
 				 * If we didn't find a usable boundary key, see if we can
 				 * deduce a NOT NULL key
 				 */
-				if (chosen == NULL && impliesNN != NULL &&
+				if (bkey == NULL && impliesNN != NULL &&
 					((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
 					 ScanDirectionIsForward(dir) :
 					 ScanDirectionIsBackward(dir)))
 				{
 					/* Yes, so build the key in notnullkeys[keysz] */
-					chosen = &notnullkeys[keysz];
-					ScanKeyEntryInitialize(chosen,
+					bkey = &notnullkeys[keysz];
+					ScanKeyEntryInitialize(bkey,
 										   (SK_SEARCHNOTNULL | SK_ISNULL |
 											(impliesNN->sk_flags &
 											 (SK_BT_DESC | SK_BT_NULLS_FIRST))),
@@ -1125,12 +1139,12 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 				}
 
 				/*
-				 * If we still didn't find a usable boundary key, quit; else
-				 * save the boundary key pointer in startKeys.
+				 * If preprocessing didn't leave a usable boundary key, quit;
+				 * else save the boundary key pointer in startKeys[]
 				 */
-				if (chosen == NULL)
+				if (bkey == NULL)
 					break;
-				startKeys[keysz++] = chosen;
+				startKeys[keysz++] = bkey;
 
 				/*
 				 * We can only consider adding more boundary keys when the one
@@ -1138,7 +1152,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 				 * (during backwards scans we can only do so when the key that
 				 * we just added to startKeys[] uses the = or <= strategy)
 				 */
-				strat_total = chosen->sk_strategy;
+				strat_total = bkey->sk_strategy;
 				if (strat_total == BTGreaterStrategyNumber ||
 					strat_total == BTLessStrategyNumber)
 					break;
@@ -1149,19 +1163,19 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 				 * make strat_total > or < (and stop adding boundary keys).
 				 * This can only happen with opclasses that lack skip support.
 				 */
-				if (chosen->sk_flags & (SK_BT_NEXT | SK_BT_PRIOR))
+				if (bkey->sk_flags & (SK_BT_NEXT | SK_BT_PRIOR))
 				{
-					Assert(chosen->sk_flags & SK_BT_SKIP);
+					Assert(bkey->sk_flags & SK_BT_SKIP);
 					Assert(strat_total == BTEqualStrategyNumber);
 
 					if (ScanDirectionIsForward(dir))
 					{
-						Assert(!(chosen->sk_flags & SK_BT_PRIOR));
+						Assert(!(bkey->sk_flags & SK_BT_PRIOR));
 						strat_total = BTGreaterStrategyNumber;
 					}
 					else
 					{
-						Assert(!(chosen->sk_flags & SK_BT_NEXT));
+						Assert(!(bkey->sk_flags & SK_BT_NEXT));
 						strat_total = BTLessStrategyNumber;
 					}
 
@@ -1175,24 +1189,30 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 
 				/*
 				 * Done if that was the last scan key output by preprocessing.
-				 * Also done if there is a gap index attribute that lacks a
-				 * usable key (only possible when preprocessing was unable to
-				 * generate a skip array key to "fill in the gap").
+				 * Also done if we've now examined all keys marked required.
 				 */
 				if (i >= so->numberOfKeys ||
-					cur->sk_attno != curattr + 1)
+					!(cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)))
 					break;
 
 				/*
 				 * Reset for next attr.
 				 */
+				Assert(cur->sk_attno == curattr + 1);
 				curattr = cur->sk_attno;
-				chosen = NULL;
+				bkey = NULL;
 				impliesNN = NULL;
 			}
 
 			/*
-			 * Can we use this key as a starting boundary for this attr?
+			 * If we've located the starting boundary key for curattr, we have
+			 * no interest in curattr's other required keys
+			 */
+			if (bkey != NULL)
+				continue;
+
+			/*
+			 * Is this key the starting boundary key for curattr?
 			 *
 			 * If not, does it imply a NOT NULL constraint?  (Because
 			 * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
@@ -1202,27 +1222,20 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 			{
 				case BTLessStrategyNumber:
 				case BTLessEqualStrategyNumber:
-					if (chosen == NULL)
-					{
-						if (ScanDirectionIsBackward(dir))
-							chosen = cur;
-						else
-							impliesNN = cur;
-					}
+					if (ScanDirectionIsBackward(dir))
+						bkey = cur;
+					else if (impliesNN == NULL)
+						impliesNN = cur;
 					break;
 				case BTEqualStrategyNumber:
-					/* override any non-equality choice */
-					chosen = cur;
+					bkey = cur;
 					break;
 				case BTGreaterEqualStrategyNumber:
 				case BTGreaterStrategyNumber:
-					if (chosen == NULL)
-					{
-						if (ScanDirectionIsForward(dir))
-							chosen = cur;
-						else
-							impliesNN = cur;
-					}
+					if (ScanDirectionIsForward(dir))
+						bkey = cur;
+					else if (impliesNN == NULL)
+						impliesNN = cur;
 					break;
 			}
 		}
@@ -1248,16 +1261,18 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 	Assert(keysz <= INDEX_MAX_KEYS);
 	for (int i = 0; i < keysz; i++)
 	{
-		ScanKey		cur = startKeys[i];
+		ScanKey		bkey = startKeys[i];
 
-		Assert(cur->sk_attno == i + 1);
+		Assert(bkey->sk_attno == i + 1);
 
-		if (cur->sk_flags & SK_ROW_HEADER)
+		if (bkey->sk_flags & SK_ROW_HEADER)
 		{
 			/*
 			 * Row comparison header: look to the first row member instead
 			 */
-			ScanKey		subkey = (ScanKey) DatumGetPointer(cur->sk_argument);
+			ScanKey		subkey = (ScanKey) DatumGetPointer(bkey->sk_argument);
+			bool		loosen_strat = false,
+						tighten_strat = false;
 
 			/*
 			 * Cannot be a NULL in the first row member: _bt_preprocess_keys
@@ -1265,122 +1280,160 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 			 * ever getting this far
 			 */
 			Assert(subkey->sk_flags & SK_ROW_MEMBER);
-			Assert(subkey->sk_attno == cur->sk_attno);
+			Assert(subkey->sk_attno == bkey->sk_attno);
 			Assert(!(subkey->sk_flags & SK_ISNULL));
 
 			/*
+			 * This is either a > or >= key (during backwards scans it is
+			 * either < or <=) that was marked required during preprocessing.
+			 * Later so->keyData[] keys can't have been marked required, so
+			 * our row compare header key must be the final startKeys[] entry.
+			 */
+			Assert(subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD));
+			Assert(i == keysz - 1);
+
+			/*
 			 * The member scankeys are already in insertion format (ie, they
 			 * have sk_func = 3-way-comparison function)
 			 */
 			memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));
 
 			/*
-			 * If the row comparison is the last positioning key we accepted,
-			 * try to add additional keys from the lower-order row members.
-			 * (If we accepted independent conditions on additional index
-			 * columns, we use those instead --- doesn't seem worth trying to
-			 * determine which is more restrictive.)  Note that this is OK
-			 * even if the row comparison is of ">" or "<" type, because the
-			 * condition applied to all but the last row member is effectively
-			 * ">=" or "<=", and so the extra keys don't break the positioning
-			 * scheme.  But, by the same token, if we aren't able to use all
-			 * the row members, then the part of the row comparison that we
-			 * did use has to be treated as just a ">=" or "<=" condition, and
-			 * so we'd better adjust strat_total accordingly.
+			 * Now look to later row compare members.
+			 *
+			 * If there's an "index attribute gap" between two row compare
+			 * members, the second member won't have been marked required, and
+			 * so can't be used as a starting boundary key here.  The part of
+			 * the row comparison that we do still use has to be treated as a
+			 * ">=" or "<=" condition.  For example, a qual "(a, c) > (1, 42)"
+			 * with an omitted intervening index attribute "b" will use an
+			 * insertion scan key "a >= 1".  Even the first "a = 1" tuple on
+			 * the leaf level might satisfy the row compare qual.
+			 *
+			 * We're able to use a _more_ restrictive strategy when we reach a
+			 * NULL row compare member, since it is always unsatisfiable.
+			 * For example, a qual "(a, b, c) >= (1, NULL, 77)" will use an
+			 * insertion scan key "a > 1".  All tuples where "a = 1" cannot
+			 * possibly satisfy the row compare qual, so this is safe.
 			 */
-			if (i == keysz - 1)
+			Assert(!(subkey->sk_flags & SK_ROW_END));
+			for (;;)
 			{
-				bool		used_all_subkeys = false;
+				subkey++;
+				Assert(subkey->sk_flags & SK_ROW_MEMBER);
 
-				Assert(!(subkey->sk_flags & SK_ROW_END));
-				for (;;)
+				if (subkey->sk_flags & SK_ISNULL)
 				{
-					subkey++;
-					Assert(subkey->sk_flags & SK_ROW_MEMBER);
-					if (subkey->sk_attno != keysz + 1)
-						break;	/* out-of-sequence, can't use it */
-					if (subkey->sk_strategy != cur->sk_strategy)
-						break;	/* wrong direction, can't use it */
-					if (subkey->sk_flags & SK_ISNULL)
-						break;	/* can't use null keys */
-					Assert(keysz < INDEX_MAX_KEYS);
-					memcpy(inskey.scankeys + keysz, subkey,
-						   sizeof(ScanKeyData));
-					keysz++;
-					if (subkey->sk_flags & SK_ROW_END)
-					{
-						used_all_subkeys = true;
-						break;
-					}
+					/*
+					 * NULL member key, can only use earlier keys.
+					 *
+					 * We deliberately avoid checking if this key is marked
+					 * required.  All earlier keys are required, and this key
+					 * is unsatisfiable either way, so we can't miss anything.
+					 */
+					tighten_strat = true;
+					break;
 				}
-				if (!used_all_subkeys)
+
+				if (!(subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)))
 				{
-					switch (strat_total)
-					{
-						case BTLessStrategyNumber:
-							strat_total = BTLessEqualStrategyNumber;
-							break;
-						case BTGreaterStrategyNumber:
-							strat_total = BTGreaterEqualStrategyNumber;
-							break;
-					}
+					/* nonrequired member key, can only use earlier keys */
+					loosen_strat = true;
+					break;
 				}
-				break;			/* done with outer loop */
+
+				Assert(subkey->sk_attno == keysz + 1);
+				Assert(subkey->sk_strategy == bkey->sk_strategy);
+				Assert(keysz < INDEX_MAX_KEYS);
+
+				memcpy(inskey.scankeys + keysz, subkey,
+					   sizeof(ScanKeyData));
+				keysz++;
+				if (subkey->sk_flags & SK_ROW_END)
+					break;
 			}
-		}
-		else
-		{
-			/*
-			 * Ordinary comparison key.  Transform the search-style scan key
-			 * to an insertion scan key by replacing the sk_func with the
-			 * appropriate btree comparison function.
-			 *
-			 * If scankey operator is not a cross-type comparison, we can use
-			 * the cached comparison function; otherwise gotta look it up in
-			 * the catalogs.  (That can't lead to infinite recursion, since no
-			 * indexscan initiated by syscache lookup will use cross-data-type
-			 * operators.)
-			 *
-			 * We support the convention that sk_subtype == InvalidOid means
-			 * the opclass input type; this is a hack to simplify life for
-			 * ScanKeyInit().
-			 */
-			if (cur->sk_subtype == rel->rd_opcintype[i] ||
-				cur->sk_subtype == InvalidOid)
+			Assert(!(loosen_strat && tighten_strat));
+			if (loosen_strat)
 			{
-				FmgrInfo   *procinfo;
-
-				procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
-				ScanKeyEntryInitializeWithInfo(inskey.scankeys + i,
-											   cur->sk_flags,
-											   cur->sk_attno,
-											   InvalidStrategy,
-											   cur->sk_subtype,
-											   cur->sk_collation,
-											   procinfo,
-											   cur->sk_argument);
+				/* Use less restrictive strategy (and fewer member keys) */
+				switch (strat_total)
+				{
+					case BTLessStrategyNumber:
+						strat_total = BTLessEqualStrategyNumber;
+						break;
+					case BTGreaterStrategyNumber:
+						strat_total = BTGreaterEqualStrategyNumber;
+						break;
+				}
 			}
-			else
+			if (tighten_strat)
 			{
-				RegProcedure cmp_proc;
-
-				cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
-											 rel->rd_opcintype[i],
-											 cur->sk_subtype,
-											 BTORDER_PROC);
-				if (!RegProcedureIsValid(cmp_proc))
-					elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
-						 BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
-						 cur->sk_attno, RelationGetRelationName(rel));
-				ScanKeyEntryInitialize(inskey.scankeys + i,
-									   cur->sk_flags,
-									   cur->sk_attno,
-									   InvalidStrategy,
-									   cur->sk_subtype,
-									   cur->sk_collation,
-									   cmp_proc,
-									   cur->sk_argument);
+				/* Use more restrictive strategy (and fewer member keys) */
+				switch (strat_total)
+				{
+					case BTLessEqualStrategyNumber:
+						strat_total = BTLessStrategyNumber;
+						break;
+					case BTGreaterEqualStrategyNumber:
+						strat_total = BTGreaterStrategyNumber;
+						break;
+				}
 			}
+
+			/* done adding to inskey (row comparison keys always come last) */
+			break;
+		}
+
+		/*
+		 * Ordinary comparison key/search-style key.
+		 *
+		 * Transform the search-style scan key to an insertion scan key by
+		 * replacing the sk_func with the appropriate btree 3-way-comparison
+		 * function.
+		 *
+		 * If scankey operator is not a cross-type comparison, we can use the
+		 * cached comparison function; otherwise gotta look it up in the
+		 * catalogs.  (That can't lead to infinite recursion, since no
+		 * indexscan initiated by syscache lookup will use cross-data-type
+		 * operators.)
+		 *
+		 * We support the convention that sk_subtype == InvalidOid means the
+		 * opclass input type; this hack simplifies life for ScanKeyInit().
+		 */
+		if (bkey->sk_subtype == rel->rd_opcintype[i] ||
+			bkey->sk_subtype == InvalidOid)
+		{
+			FmgrInfo   *procinfo;
+
+			procinfo = index_getprocinfo(rel, bkey->sk_attno, BTORDER_PROC);
+			ScanKeyEntryInitializeWithInfo(inskey.scankeys + i,
+										   bkey->sk_flags,
+										   bkey->sk_attno,
+										   InvalidStrategy,
+										   bkey->sk_subtype,
+										   bkey->sk_collation,
+										   procinfo,
+										   bkey->sk_argument);
+		}
+		else
+		{
+			RegProcedure cmp_proc;
+
+			cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
+										 rel->rd_opcintype[i],
+										 bkey->sk_subtype, BTORDER_PROC);
+			if (!RegProcedureIsValid(cmp_proc))
+				elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
+					 BTORDER_PROC, rel->rd_opcintype[i], bkey->sk_subtype,
+					 bkey->sk_attno, RelationGetRelationName(rel));
+			ScanKeyEntryInitialize(inskey.scankeys + i,
+								   bkey->sk_flags,
+								   bkey->sk_attno,
+								   InvalidStrategy,
+								   bkey->sk_subtype,
+								   bkey->sk_collation,
+								   cmp_proc,
+								   bkey->sk_argument);
 		}
 	}
 
@@ -1469,6 +1522,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 
 	if (!BufferIsValid(so->currPos.buf))
 	{
+		Assert(!so->needPrimScan);
+
 		/*
 		 * We only get here if the index is completely empty. Lock relation
 		 * because nothing finer to lock exists.  Without a buffer lock, it's
@@ -1487,7 +1542,6 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 
 		if (!BufferIsValid(so->currPos.buf))
 		{
-			Assert(!so->needPrimScan);
 			_bt_parallel_done(scan);
 			return false;
 		}
@@ -1610,7 +1664,13 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
 	so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf);
 	so->currPos.prevPage = opaque->btpo_prev;
 	so->currPos.nextPage = opaque->btpo_next;
+	/* delay setting so->currPos.lsn until _bt_drop_lock_and_maybe_pin */
+	so->currPos.dir = dir;
+	so->currPos.nextTupleOffset = 0;
 
+	/* either moreRight or moreLeft should be set now (may be unset later) */
+	Assert(ScanDirectionIsForward(dir) ? so->currPos.moreRight :
+		   so->currPos.moreLeft);
 	Assert(!P_IGNORE(opaque));
 	Assert(BTScanPosIsPinned(so->currPos));
 	Assert(!so->needPrimScan);
@@ -1626,14 +1686,6 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
 								 so->currPos.currPage);
 	}
 
-	/* initialize remaining currPos fields related to current page */
-	so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf);
-	so->currPos.dir = dir;
-	so->currPos.nextTupleOffset = 0;
-	/* either moreLeft or moreRight should be set now (may be unset later) */
-	Assert(ScanDirectionIsForward(dir) ? so->currPos.moreRight :
-		   so->currPos.moreLeft);
-
 	PredicateLockPage(rel, so->currPos.currPage, scan->xs_snapshot);
 
 	/* initialize local variables */
@@ -2107,10 +2159,9 @@ _bt_returnitem(IndexScanDesc scan, BTScanOpaque so)
  *
  * Wrapper on _bt_readnextpage that performs final steps for the current page.
  *
- * On entry, if so->currPos.buf is valid the buffer is pinned but not locked.
- * If there's no pin held, it's because _bt_drop_lock_and_maybe_pin dropped
- * the pin eagerly earlier on.  The scan must have so->currPos.currPage set to
- * a valid block, in any case.
+ * On entry, so->currPos must be valid.  Its buffer will be pinned, though
+ * never locked. (Actually, when so->dropPin there won't even be a pin held,
+ * though so->currPos.currPage must still be set to a valid block number.)
  */
 static bool
 _bt_steppage(IndexScanDesc scan, ScanDirection dir)
@@ -2251,12 +2302,14 @@ _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
 	 */
 	if (_bt_readpage(scan, dir, offnum, true))
 	{
+		Relation	rel = scan->indexRelation;
+
 		/*
 		 * _bt_readpage succeeded.  Drop the lock (and maybe the pin) on
 		 * so->currPos.buf in preparation for btgettuple returning tuples.
 		 */
 		Assert(BTScanPosIsPinned(so->currPos));
-		_bt_drop_lock_and_maybe_pin(scan, &so->currPos);
+		_bt_drop_lock_and_maybe_pin(rel, so);
 		return true;
 	}
 
@@ -2278,9 +2331,12 @@ _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
  * previously-saved right link or left link.  lastcurrblkno is the page that
  * was current at the point where the blkno link was saved, which we use to
  * reason about concurrent page splits/page deletions during backwards scans.
+ * In the common case where seized=false, blkno is either so->currPos.nextPage
+ * or so->currPos.prevPage, and lastcurrblkno is so->currPos.currPage.
  *
- * On entry, caller shouldn't hold any locks or pins on any page (we work
- * directly off of blkno and lastcurrblkno instead).  Parallel scan callers
+ * On entry, so->currPos shouldn't be locked by caller.  so->currPos.buf must
+ * be InvalidBuffer/unpinned as needed by caller (note that lastcurrblkno
+ * won't need to be read again in almost all cases).  Parallel scan callers
  * that seized the scan before calling here should pass seized=true; such a
  * caller's blkno and lastcurrblkno arguments come from the seized scan.
  * seized=false callers just pass us the blkno/lastcurrblkno taken from their
@@ -2294,11 +2350,11 @@ _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
  *
  * On success exit, so->currPos is updated to contain data from the next
  * interesting page, and we return true.  We hold a pin on the buffer on
- * success exit, except when _bt_drop_lock_and_maybe_pin decided it was safe
- * to eagerly drop the pin (to avoid blocking VACUUM).
+ * success exit (except during so->dropPin index scans, when we drop the pin
+ * eagerly to avoid blocking VACUUM).
  *
- * If there are no more matching records in the given direction, we drop all
- * locks and pins, invalidate so->currPos, and return false.
+ * If there are no more matching records in the given direction, we invalidate
+ * so->currPos (while ensuring it retains no locks or pins), and return false.
  *
  * We always release the scan for a parallel scan caller, regardless of
  * success or failure; we'll call _bt_parallel_release as soon as possible.
@@ -2413,7 +2469,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
 	 */
 	Assert(so->currPos.currPage == blkno);
 	Assert(BTScanPosIsPinned(so->currPos));
-	_bt_drop_lock_and_maybe_pin(scan, &so->currPos);
+	_bt_drop_lock_and_maybe_pin(rel, so);
 
 	return true;
 }
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
index 3794cc924ad..9d70e89c1f3 100644
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -105,7 +105,7 @@ typedef struct BTShared
 	int			scantuplesortstates;
 
 	/* Query ID, for report in worker processes */
-	uint64		queryid;
+	int64		queryid;
 
 	/*
 	 * workersdonecv is used to monitor the progress of workers.  All parallel
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index 1a15dfcb7d3..9aed207995f 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -44,7 +44,6 @@ static bool _bt_array_decrement(Relation rel, ScanKey skey, BTArrayKeyInfo *arra
 static bool _bt_array_increment(Relation rel, ScanKey skey, BTArrayKeyInfo *array);
 static bool _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir,
 											 bool *skip_array_set);
-static void _bt_rewind_nonrequired_arrays(IndexScanDesc scan, ScanDirection dir);
 static bool _bt_tuple_before_array_skeys(IndexScanDesc scan, ScanDirection dir,
 										 IndexTuple tuple, TupleDesc tupdesc, int tupnatts,
 										 bool readpagetup, int sktrig, bool *scanBehind);
@@ -52,7 +51,6 @@ static bool _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
 								   IndexTuple tuple, int tupnatts, TupleDesc tupdesc,
 								   int sktrig, bool sktrig_required);
 #ifdef USE_ASSERT_CHECKING
-static bool _bt_verify_arrays_bt_first(IndexScanDesc scan, ScanDirection dir);
 static bool _bt_verify_keys_with_arraykeys(IndexScanDesc scan);
 #endif
 static bool _bt_oppodir_checkkeys(IndexScanDesc scan, ScanDirection dir,
@@ -1035,73 +1033,6 @@ _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir,
 }
 
 /*
- * _bt_rewind_nonrequired_arrays() -- Rewind SAOP arrays not marked required
- *
- * Called when _bt_advance_array_keys decides to start a new primitive index
- * scan on the basis of the current scan position being before the position
- * that _bt_first is capable of repositioning the scan to by applying an
- * inequality operator required in the opposite-to-scan direction only.
- *
- * Although equality strategy scan keys (for both arrays and non-arrays alike)
- * are either marked required in both directions or in neither direction,
- * there is a sense in which non-required arrays behave like required arrays.
- * With a qual such as "WHERE a IN (100, 200) AND b >= 3 AND c IN (5, 6, 7)",
- * the scan key on "c" is non-required, but nevertheless enables positioning
- * the scan at the first tuple >= "(100, 3, 5)" on the leaf level during the
- * first descent of the tree by _bt_first.  Later on, there could also be a
- * second descent, that places the scan right before tuples >= "(200, 3, 5)".
- * _bt_first must never be allowed to build an insertion scan key whose "c"
- * entry is set to a value other than 5, the "c" array's first element/value.
- * (Actually, it's the first in the current scan direction.  This example uses
- * a forward scan.)
- *
- * Calling here resets the array scan key elements for the scan's non-required
- * arrays.  This is strictly necessary for correctness in a subset of cases
- * involving "required in opposite direction"-triggered primitive index scans.
- * Not all callers are at risk of _bt_first using a non-required array like
- * this, but advancement always resets the arrays when another primitive scan
- * is scheduled, just to keep things simple.  Array advancement even makes
- * sure to reset non-required arrays during scans that have no inequalities.
- * (Advancement still won't call here when there are no inequalities, though
- * that's just because it's all handled indirectly instead.)
- *
- * Note: _bt_verify_arrays_bt_first is called by an assertion to enforce that
- * everybody got this right.
- *
- * Note: In practice almost all SAOP arrays are marked required during
- * preprocessing (if necessary by generating skip arrays).  It is hardly ever
- * truly necessary to call here, but consistently doing so is simpler.
- */
-static void
-_bt_rewind_nonrequired_arrays(IndexScanDesc scan, ScanDirection dir)
-{
-	Relation	rel = scan->indexRelation;
-	BTScanOpaque so = (BTScanOpaque) scan->opaque;
-	int			arrayidx = 0;
-
-	for (int ikey = 0; ikey < so->numberOfKeys; ikey++)
-	{
-		ScanKey		cur = so->keyData + ikey;
-		BTArrayKeyInfo *array = NULL;
-
-		if (!(cur->sk_flags & SK_SEARCHARRAY) ||
-			cur->sk_strategy != BTEqualStrategyNumber)
-			continue;
-
-		array = &so->arrayKeys[arrayidx++];
-		Assert(array->scan_key == ikey);
-
-		if ((cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)))
-			continue;
-
-		Assert(array->num_elems != -1); /* No non-required skip arrays */
-
-		_bt_array_set_low_or_high(rel, cur, array,
-								  ScanDirectionIsForward(dir));
-	}
-}
-
-/*
  * _bt_tuple_before_array_skeys() -- too early to advance required arrays?
  *
  * We always compare the tuple using the current array keys (which we assume
@@ -1380,8 +1311,6 @@ _bt_start_prim_scan(IndexScanDesc scan, ScanDirection dir)
 	 */
 	if (so->needPrimScan)
 	{
-		Assert(_bt_verify_arrays_bt_first(scan, dir));
-
 		/*
 		 * Flag was set -- must call _bt_first again, which will reset the
 		 * scan's needPrimScan flag
@@ -2007,14 +1936,7 @@ _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
 	 */
 	else if (has_required_opposite_direction_only && pstate->finaltup &&
 			 unlikely(!_bt_oppodir_checkkeys(scan, dir, pstate->finaltup)))
-	{
-		/*
-		 * Make sure that any SAOP arrays that were not marked required by
-		 * preprocessing are reset to their first element for this direction
-		 */
-		_bt_rewind_nonrequired_arrays(scan, dir);
 		goto new_prim_scan;
-	}
 
 continue_scan:
 
@@ -2045,8 +1967,6 @@ continue_scan:
 		 */
 		so->oppositeDirCheck = has_required_opposite_direction_only;
 
-		_bt_rewind_nonrequired_arrays(scan, dir);
-
 		/*
 		 * skip by setting "look ahead" mechanism's offnum for forwards scans
 		 * (backwards scans check scanBehind flag directly instead)
@@ -2143,48 +2063,6 @@ end_toplevel_scan:
 
 #ifdef USE_ASSERT_CHECKING
 /*
- * Verify that the scan's qual state matches what we expect at the point that
- * _bt_start_prim_scan is about to start a just-scheduled new primitive scan.
- *
- * We enforce a rule against non-required array scan keys: they must start out
- * with whatever element is the first for the scan's current scan direction.
- * See _bt_rewind_nonrequired_arrays comments for an explanation.
- */
-static bool
-_bt_verify_arrays_bt_first(IndexScanDesc scan, ScanDirection dir)
-{
-	BTScanOpaque so = (BTScanOpaque) scan->opaque;
-	int			arrayidx = 0;
-
-	for (int ikey = 0; ikey < so->numberOfKeys; ikey++)
-	{
-		ScanKey		cur = so->keyData + ikey;
-		BTArrayKeyInfo *array = NULL;
-		int			first_elem_dir;
-
-		if (!(cur->sk_flags & SK_SEARCHARRAY) ||
-			cur->sk_strategy != BTEqualStrategyNumber)
-			continue;
-
-		array = &so->arrayKeys[arrayidx++];
-
-		if (((cur->sk_flags & SK_BT_REQFWD) && ScanDirectionIsForward(dir)) ||
-			((cur->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsBackward(dir)))
-			continue;
-
-		if (ScanDirectionIsForward(dir))
-			first_elem_dir = 0;
-		else
-			first_elem_dir = array->num_elems - 1;
-
-		if (array->cur_elem != first_elem_dir)
-			return false;
-	}
-
-	return _bt_verify_keys_with_arraykeys(scan);
-}
-
-/*
  * Verify that the scan's "so->keyData[]" scan keys are in agreement with
  * its array key state
  */
@@ -2194,6 +2072,7 @@ _bt_verify_keys_with_arraykeys(IndexScanDesc scan)
 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
 	int			last_sk_attno = InvalidAttrNumber,
 				arrayidx = 0;
+	bool		nonrequiredseen = false;
 
 	if (!so->qual_ok)
 		return false;
@@ -2217,8 +2096,16 @@ _bt_verify_keys_with_arraykeys(IndexScanDesc scan)
 		if (array->num_elems != -1 &&
 			cur->sk_argument != array->elem_values[array->cur_elem])
 			return false;
-		if (last_sk_attno > cur->sk_attno)
-			return false;
+		if (cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))
+		{
+			if (last_sk_attno > cur->sk_attno)
+				return false;
+			if (nonrequiredseen)
+				return false;
+		}
+		else
+			nonrequiredseen = true;
+
 		last_sk_attno = cur->sk_attno;
 	}
 
@@ -2551,37 +2438,12 @@ _bt_set_startikey(IndexScanDesc scan, BTReadPageState *pstate)
 		if (!(key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)))
 		{
 			/* Scan key isn't marked required (corner case) */
-			Assert(!(key->sk_flags & SK_ROW_HEADER));
 			break;				/* unsafe */
 		}
 		if (key->sk_flags & SK_ROW_HEADER)
 		{
-			/*
-			 * RowCompare inequality.
-			 *
-			 * Only the first subkey from a RowCompare can ever be marked
-			 * required (that happens when the row header is marked required).
-			 * There is no simple, general way for us to transitively deduce
-			 * whether or not every tuple on the page satisfies a RowCompare
-			 * key based only on firsttup and lasttup -- so we just give up.
-			 */
-			if (!start_past_saop_eq && !so->skipScan)
-				break;			/* unsafe to go further */
-
-			/*
-			 * We have to be even more careful with RowCompares that come
-			 * after an array: we assume it's unsafe to even bypass the array.
-			 * Calling _bt_start_array_keys to recover the scan's arrays
-			 * following use of forcenonrequired mode isn't compatible with
-			 * _bt_check_rowcompare's continuescan=false behavior with NULL
-			 * row compare members.  _bt_advance_array_keys must not make a
-			 * decision on the basis of a key not being satisfied in the
-			 * opposite-to-scan direction until the scan reaches a leaf page
-			 * where the same key begins to be satisfied in scan direction.
-			 * The _bt_first !used_all_subkeys behavior makes this limitation
-			 * hard to work around some other way.
-			 */
-			return;				/* completely unsafe to set pstate.startikey */
+			/* RowCompare inequalities currently aren't supported */
+			break;				/* "unsafe" */
 		}
 		if (key->sk_strategy != BTEqualStrategyNumber)
 		{
@@ -3078,6 +2940,31 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts,
 
 		Assert(subkey->sk_flags & SK_ROW_MEMBER);
 
+		/* When a NULL row member is compared, the row never matches */
+		if (subkey->sk_flags & SK_ISNULL)
+		{
+			/*
+			 * Unlike the simple-scankey case, this isn't a disallowed case
+			 * (except when it's the first row element that has the NULL arg).
+			 * But it can never match.  If all the earlier row comparison
+			 * columns are required for the scan direction, we can stop the
+			 * scan, because there can't be another tuple that will succeed.
+			 */
+			Assert(subkey != (ScanKey) DatumGetPointer(skey->sk_argument));
+			subkey--;
+			if (forcenonrequired)
+			{
+				/* treating scan's keys as non-required */
+			}
+			else if ((subkey->sk_flags & SK_BT_REQFWD) &&
+					 ScanDirectionIsForward(dir))
+				*continuescan = false;
+			else if ((subkey->sk_flags & SK_BT_REQBKWD) &&
+					 ScanDirectionIsBackward(dir))
+				*continuescan = false;
+			return false;
+		}
+
 		if (subkey->sk_attno > tupnatts)
 		{
 			/*
@@ -3087,11 +2974,7 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts,
 			 * attribute passes the qual.
 			 */
 			Assert(BTreeTupleIsPivot(tuple));
-			cmpresult = 0;
-			if (subkey->sk_flags & SK_ROW_END)
-				break;
-			subkey++;
-			continue;
+			return true;
 		}
 
 		datum = index_getattr(tuple,
@@ -3101,6 +2984,8 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts,
 
 		if (isNull)
 		{
+			int			reqflags;
+
 			if (forcenonrequired)
 			{
 				/* treating scan's keys as non-required */
@@ -3111,15 +2996,35 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts,
 				 * Since NULLs are sorted before non-NULLs, we know we have
 				 * reached the lower limit of the range of values for this
 				 * index attr.  On a backward scan, we can stop if this qual
-				 * is one of the "must match" subset.  We can stop regardless
-				 * of whether the qual is > or <, so long as it's required,
-				 * because it's not possible for any future tuples to pass. On
-				 * a forward scan, however, we must keep going, because we may
-				 * have initially positioned to the start of the index.
-				 * (_bt_advance_array_keys also relies on this behavior during
-				 * forward scans.)
+				 * is one of the "must match" subset.  However, on a forwards
+				 * scan, we must keep going, because we may have initially
+				 * positioned to the start of the index.
+				 *
+				 * All required NULLS FIRST > row members can use NULL tuple
+				 * values to end backwards scans, just like with other values.
+				 * A qual "WHERE (a, b, c) > (9, 42, 'foo')" can terminate a
+				 * backwards scan upon reaching the index's rightmost "a = 9"
+				 * tuple whose "b" column contains a NULL (if not sooner).
+				 * Since "b" is NULLS FIRST, we can treat its NULLs as "<" 42.
 				 */
-				if ((subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
+				reqflags = SK_BT_REQBKWD;
+
+				/*
+				 * When a most significant required NULLS FIRST < row compare
+				 * member sees NULL tuple values during a backwards scan, it
+				 * signals the end of matches for the whole row compare/scan.
+				 * A qual "WHERE (a, b, c) < (9, 42, 'foo')" will terminate a
+				 * backwards scan upon reaching the rightmost tuple whose "a"
+				 * column has a NULL.  The "a" NULL value is "<" 9, and yet
+				 * our < row compare will still end the scan.  (This isn't
+				 * safe with later/lower-order row members.  Notice that it
+				 * can only happen with an "a" NULL some time after the scan
+				 * completely stops needing to use its "b" and "c" members.)
+				 */
+				if (subkey == (ScanKey) DatumGetPointer(skey->sk_argument))
+					reqflags |= SK_BT_REQFWD;	/* safe, first row member */
+
+				if ((subkey->sk_flags & reqflags) &&
 					ScanDirectionIsBackward(dir))
 					*continuescan = false;
 			}
@@ -3129,15 +3034,35 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts,
 				 * Since NULLs are sorted after non-NULLs, we know we have
 				 * reached the upper limit of the range of values for this
 				 * index attr.  On a forward scan, we can stop if this qual is
-				 * one of the "must match" subset.  We can stop regardless of
-				 * whether the qual is > or <, so long as it's required,
-				 * because it's not possible for any future tuples to pass. On
-				 * a backward scan, however, we must keep going, because we
-				 * may have initially positioned to the end of the index.
-				 * (_bt_advance_array_keys also relies on this behavior during
-				 * backward scans.)
+				 * one of the "must match" subset.  However, on a backward
+				 * scan, we must keep going, because we may have initially
+				 * positioned to the end of the index.
+				 *
+				 * All required NULLS LAST < row members can use NULL tuple
+				 * values to end forwards scans, just like with other values.
+				 * A qual "WHERE (a, b, c) < (9, 42, 'foo')" can terminate a
+				 * forwards scan upon reaching the index's leftmost "a = 9"
+				 * tuple whose "b" column contains a NULL (if not sooner).
+				 * Since "b" is NULLS LAST, we can treat its NULLs as ">" 42.
+				 */
+				reqflags = SK_BT_REQFWD;
+
+				/*
+				 * When a most significant required NULLS LAST > row compare
+				 * member sees NULL tuple values during a forwards scan, it
+				 * signals the end of matches for the whole row compare/scan.
+				 * A qual "WHERE (a, b, c) > (9, 42, 'foo')" will terminate a
+				 * forwards scan upon reaching the leftmost tuple whose "a"
+				 * column has a NULL.  The "a" NULL value is ">" 9, and yet
+				 * our > row compare will end the scan.  (This isn't safe with
+				 * later/lower-order row members.  Notice that it can only
+				 * happen with an "a" NULL some time after the scan completely
+				 * stops needing to use its "b" and "c" members.)
 				 */
-				if ((subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
+				if (subkey == (ScanKey) DatumGetPointer(skey->sk_argument))
+					reqflags |= SK_BT_REQBKWD;	/* safe, first row member */
+
+				if ((subkey->sk_flags & reqflags) &&
 					ScanDirectionIsForward(dir))
 					*continuescan = false;
 			}
@@ -3148,30 +3073,6 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts,
 			return false;
 		}
 
-		if (subkey->sk_flags & SK_ISNULL)
-		{
-			/*
-			 * Unlike the simple-scankey case, this isn't a disallowed case
-			 * (except when it's the first row element that has the NULL arg).
-			 * But it can never match.  If all the earlier row comparison
-			 * columns are required for the scan direction, we can stop the
-			 * scan, because there can't be another tuple that will succeed.
-			 */
-			Assert(subkey != (ScanKey) DatumGetPointer(skey->sk_argument));
-			subkey--;
-			if (forcenonrequired)
-			{
-				/* treating scan's keys as non-required */
-			}
-			else if ((subkey->sk_flags & SK_BT_REQFWD) &&
-					 ScanDirectionIsForward(dir))
-				*continuescan = false;
-			else if ((subkey->sk_flags & SK_BT_REQBKWD) &&
-					 ScanDirectionIsBackward(dir))
-				*continuescan = false;
-			return false;
-		}
-
 		/* Perform the test --- three-way comparison not bool operator */
 		cmpresult = DatumGetInt32(FunctionCall2Coll(&subkey->sk_func,
 													subkey->sk_collation,
@@ -3330,87 +3231,85 @@ _bt_checkkeys_look_ahead(IndexScanDesc scan, BTReadPageState *pstate,
  * current page and killed tuples thereon (generally, this should only be
  * called if so->numKilled > 0).
  *
- * The caller does not have a lock on the page and may or may not have the
- * page pinned in a buffer.  Note that read-lock is sufficient for setting
- * LP_DEAD status (which is only a hint).
- *
- * We match items by heap TID before assuming they are the right ones to
- * delete.  We cope with cases where items have moved right due to insertions.
- * If an item has moved off the current page due to a split, we'll fail to
- * find it and do nothing (this is not an error case --- we assume the item
- * will eventually get marked in a future indexscan).
+ * Caller should not have a lock on the so->currPos page, but must hold a
+ * buffer pin when !so->dropPin.  When we return, it still won't be locked.
+ * Caller will continue to hold whatever pins were held before calling here.
  *
- * Note that if we hold a pin on the target page continuously from initially
- * reading the items until applying this function, VACUUM cannot have deleted
- * any items from the page, and so there is no need to search left from the
- * recorded offset.  (This observation also guarantees that the item is still
- * the right one to delete, which might otherwise be questionable since heap
- * TIDs can get recycled.)	This holds true even if the page has been modified
- * by inserts and page splits, so there is no need to consult the LSN.
+ * We match items by heap TID before assuming they are the right ones to set
+ * LP_DEAD.  If the scan is one that holds a buffer pin on the target page
+ * continuously from initially reading the items until applying this function
+ * (if it is a !so->dropPin scan), VACUUM cannot have deleted any items on the
+ * page, so the page's TIDs can't have been recycled by now.  There's no risk
+ * that we'll confuse a new index tuple that happens to use a recycled TID
+ * with a now-removed tuple with the same TID (that used to be on this same
+ * page).  We can't rely on that during scans that drop buffer pins eagerly
+ * (so->dropPin scans), though, so we must condition setting LP_DEAD bits on
+ * the page LSN having not changed since back when _bt_readpage saw the page.
+ * We totally give up on setting LP_DEAD bits when the page LSN changed.
  *
- * If the pin was released after reading the page, then we re-read it.  If it
- * has been modified since we read it (as determined by the LSN), we dare not
- * flag any entries because it is possible that the old entry was vacuumed
- * away and the TID was re-used by a completely different heap tuple.
+ * We give up much less often during !so->dropPin scans, but it still happens.
+ * We cope with cases where items have moved right due to insertions.  If an
+ * item has moved off the current page due to a split, we'll fail to find it
+ * and just give up on it.
  */
 void
 _bt_killitems(IndexScanDesc scan)
 {
+	Relation	rel = scan->indexRelation;
 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
 	Page		page;
 	BTPageOpaque opaque;
 	OffsetNumber minoff;
 	OffsetNumber maxoff;
-	int			i;
 	int			numKilled = so->numKilled;
 	bool		killedsomething = false;
-	bool		droppedpin PG_USED_FOR_ASSERTS_ONLY;
+	Buffer		buf;
 
+	Assert(numKilled > 0);
 	Assert(BTScanPosIsValid(so->currPos));
+	Assert(scan->heapRelation != NULL); /* can't be a bitmap index scan */
 
-	/*
-	 * Always reset the scan state, so we don't look for same items on other
-	 * pages.
-	 */
+	/* Always invalidate so->killedItems[] before leaving so->currPos */
 	so->numKilled = 0;
 
-	if (BTScanPosIsPinned(so->currPos))
+	if (!so->dropPin)
 	{
 		/*
 		 * We have held the pin on this page since we read the index tuples,
 		 * so all we need to do is lock it.  The pin will have prevented
-		 * re-use of any TID on the page, so there is no need to check the
-		 * LSN.
+		 * concurrent VACUUMs from recycling any of the TIDs on the page.
 		 */
-		droppedpin = false;
-		_bt_lockbuf(scan->indexRelation, so->currPos.buf, BT_READ);
-
-		page = BufferGetPage(so->currPos.buf);
+		Assert(BTScanPosIsPinned(so->currPos));
+		buf = so->currPos.buf;
+		_bt_lockbuf(rel, buf, BT_READ);
 	}
 	else
 	{
-		Buffer		buf;
+		XLogRecPtr	latestlsn;
 
-		droppedpin = true;
-		/* Attempt to re-read the buffer, getting pin and lock. */
-		buf = _bt_getbuf(scan->indexRelation, so->currPos.currPage, BT_READ);
+		Assert(!BTScanPosIsPinned(so->currPos));
+		Assert(RelationNeedsWAL(rel));
+		buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ);
 
-		page = BufferGetPage(buf);
-		if (BufferGetLSNAtomic(buf) == so->currPos.lsn)
-			so->currPos.buf = buf;
-		else
+		latestlsn = BufferGetLSNAtomic(buf);
+		Assert(!XLogRecPtrIsInvalid(so->currPos.lsn));
+		Assert(so->currPos.lsn <= latestlsn);
+		if (so->currPos.lsn != latestlsn)
 		{
-			/* Modified while not pinned means hinting is not safe. */
-			_bt_relbuf(scan->indexRelation, buf);
+			/* Modified, give up on hinting */
+			_bt_relbuf(rel, buf);
 			return;
 		}
+
+		/* Unmodified, hinting is safe */
 	}
 
+	page = BufferGetPage(buf);
 	opaque = BTPageGetOpaque(page);
 	minoff = P_FIRSTDATAKEY(opaque);
 	maxoff = PageGetMaxOffsetNumber(page);
 
-	for (i = 0; i < numKilled; i++)
+	for (int i = 0; i < numKilled; i++)
 	{
 		int			itemIndex = so->killedItems[i];
 		BTScanPosItem *kitem = &so->currPos.items[itemIndex];
@@ -3442,7 +3341,7 @@ _bt_killitems(IndexScanDesc scan)
 				 * correctness.
 				 *
 				 * Note that the page may have been modified in almost any way
-				 * since we first read it (in the !droppedpin case), so it's
+				 * since we first read it (in the !so->dropPin case), so it's
 				 * possible that this posting list tuple wasn't a posting list
 				 * tuple when we first encountered its heap TIDs.
 				 */
@@ -3458,7 +3357,7 @@ _bt_killitems(IndexScanDesc scan)
 					 * though only in the common case where the page can't
 					 * have been concurrently modified
 					 */
-					Assert(kitem->indexOffset == offnum || !droppedpin);
+					Assert(kitem->indexOffset == offnum || !so->dropPin);
 
 					/*
 					 * Read-ahead to later kitems here.
@@ -3522,10 +3421,13 @@ _bt_killitems(IndexScanDesc scan)
 	if (killedsomething)
 	{
 		opaque->btpo_flags |= BTP_HAS_GARBAGE;
-		MarkBufferDirtyHint(so->currPos.buf, true);
+		MarkBufferDirtyHint(buf, true);
 	}
 
-	_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
+	if (!so->dropPin)
+		_bt_unlockbuf(rel, buf);
+	else
+		_bt_relbuf(rel, buf);
 }
 
 
diff --git a/src/backend/access/rmgrdesc/replorigindesc.c b/src/backend/access/rmgrdesc/replorigindesc.c
index 5dd74233996..35e3af2903e 100644
--- a/src/backend/access/rmgrdesc/replorigindesc.c
+++ b/src/backend/access/rmgrdesc/replorigindesc.c
@@ -29,7 +29,7 @@ replorigin_desc(StringInfo buf, XLogReaderState *record)
 
 				xlrec = (xl_replorigin_set *) rec;
 
-				appendStringInfo(buf, "set %u; lsn %X/%X; force: %d",
+				appendStringInfo(buf, "set %u; lsn %X/%08X; force: %d",
 								 xlrec->node_id,
 								 LSN_FORMAT_ARGS(xlrec->remote_lsn),
 								 xlrec->force);
diff --git a/src/backend/access/rmgrdesc/xactdesc.c b/src/backend/access/rmgrdesc/xactdesc.c
index 305598e2865..f0f696855b9 100644
--- a/src/backend/access/rmgrdesc/xactdesc.c
+++ b/src/backend/access/rmgrdesc/xactdesc.c
@@ -359,7 +359,7 @@ xact_desc_commit(StringInfo buf, uint8 info, xl_xact_commit *xlrec, RepOriginId
 
 	if (parsed.xinfo & XACT_XINFO_HAS_ORIGIN)
 	{
-		appendStringInfo(buf, "; origin: node %u, lsn %X/%X, at %s",
+		appendStringInfo(buf, "; origin: node %u, lsn %X/%08X, at %s",
 						 origin_id,
 						 LSN_FORMAT_ARGS(parsed.origin_lsn),
 						 timestamptz_to_str(parsed.origin_timestamp));
@@ -384,7 +384,7 @@ xact_desc_abort(StringInfo buf, uint8 info, xl_xact_abort *xlrec, RepOriginId or
 
 	if (parsed.xinfo & XACT_XINFO_HAS_ORIGIN)
 	{
-		appendStringInfo(buf, "; origin: node %u, lsn %X/%X, at %s",
+		appendStringInfo(buf, "; origin: node %u, lsn %X/%08X, at %s",
 						 origin_id,
 						 LSN_FORMAT_ARGS(parsed.origin_lsn),
 						 timestamptz_to_str(parsed.origin_timestamp));
@@ -418,7 +418,7 @@ xact_desc_prepare(StringInfo buf, uint8 info, xl_xact_prepare *xlrec, RepOriginI
 	 * way as PrepareRedoAdd().
 	 */
 	if (origin_id != InvalidRepOriginId)
-		appendStringInfo(buf, "; origin: node %u, lsn %X/%X, at %s",
+		appendStringInfo(buf, "; origin: node %u, lsn %X/%08X, at %s",
 						 origin_id,
 						 LSN_FORMAT_ARGS(parsed.origin_lsn),
 						 timestamptz_to_str(parsed.origin_timestamp));
diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c
index 58040f28656..cd6c2a2f650 100644
--- a/src/backend/access/rmgrdesc/xlogdesc.c
+++ b/src/backend/access/rmgrdesc/xlogdesc.c
@@ -65,7 +65,7 @@ xlog_desc(StringInfo buf, XLogReaderState *record)
 	{
 		CheckPoint *checkpoint = (CheckPoint *) rec;
 
-		appendStringInfo(buf, "redo %X/%X; "
+		appendStringInfo(buf, "redo %X/%08X; "
 						 "tli %u; prev tli %u; fpw %s; wal_level %s; xid %u:%u; oid %u; multi %u; offset %u; "
 						 "oldest xid %u in DB %u; oldest multi %u in DB %u; "
 						 "oldest/newest commit timestamp xid: %u/%u; "
@@ -111,7 +111,7 @@ xlog_desc(StringInfo buf, XLogReaderState *record)
 		XLogRecPtr	startpoint;
 
 		memcpy(&startpoint, rec, sizeof(XLogRecPtr));
-		appendStringInfo(buf, "%X/%X", LSN_FORMAT_ARGS(startpoint));
+		appendStringInfo(buf, "%X/%08X", LSN_FORMAT_ARGS(startpoint));
 	}
 	else if (info == XLOG_PARAMETER_CHANGE)
 	{
@@ -156,7 +156,7 @@ xlog_desc(StringInfo buf, XLogReaderState *record)
 		xl_overwrite_contrecord xlrec;
 
 		memcpy(&xlrec, rec, sizeof(xl_overwrite_contrecord));
-		appendStringInfo(buf, "lsn %X/%X; time %s",
+		appendStringInfo(buf, "lsn %X/%08X; time %s",
 						 LSN_FORMAT_ARGS(xlrec.overwritten_lsn),
 						 timestamptz_to_str(xlrec.overwrite_time));
 	}
diff --git a/src/backend/access/transam/commit_ts.c b/src/backend/access/transam/commit_ts.c
index 113fae1437a..225ff7ca9f2 100644
--- a/src/backend/access/transam/commit_ts.c
+++ b/src/backend/access/transam/commit_ts.c
@@ -707,6 +707,13 @@ ActivateCommitTs(void)
 	TransactionId xid;
 	int64		pageno;
 
+	/*
+	 * During bootstrap, we should not register commit timestamps so skip the
+	 * activation in this case.
+	 */
+	if (IsBootstrapProcessingMode())
+		return;
+
 	/* If we've done this already, there's nothing to do */
 	LWLockAcquire(CommitTsLock, LW_EXCLUSIVE);
 	if (commitTsShared->commitTsActive)
diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c
index 3c06ac45532..7a7afe3edc6 100644
--- a/src/backend/access/transam/multixact.c
+++ b/src/backend/access/transam/multixact.c
@@ -1847,7 +1847,7 @@ AtPrepare_MultiXact(void)
  *		Clean up after successful PREPARE TRANSACTION
  */
 void
-PostPrepare_MultiXact(TransactionId xid)
+PostPrepare_MultiXact(FullTransactionId fxid)
 {
 	MultiXactId myOldestMember;
 
@@ -1858,7 +1858,7 @@ PostPrepare_MultiXact(TransactionId xid)
 	myOldestMember = OldestMemberMXactId[MyProcNumber];
 	if (MultiXactIdIsValid(myOldestMember))
 	{
-		ProcNumber	dummyProcNumber = TwoPhaseGetDummyProcNumber(xid, false);
+		ProcNumber	dummyProcNumber = TwoPhaseGetDummyProcNumber(fxid, false);
 
 		/*
 		 * Even though storing MultiXactId is atomic, acquire lock to make
@@ -1896,10 +1896,10 @@ PostPrepare_MultiXact(TransactionId xid)
  *		Recover the state of a prepared transaction at startup
  */
 void
-multixact_twophase_recover(TransactionId xid, uint16 info,
+multixact_twophase_recover(FullTransactionId fxid, uint16 info,
 						   void *recdata, uint32 len)
 {
-	ProcNumber	dummyProcNumber = TwoPhaseGetDummyProcNumber(xid, false);
+	ProcNumber	dummyProcNumber = TwoPhaseGetDummyProcNumber(fxid, false);
 	MultiXactId oldestMember;
 
 	/*
@@ -1917,10 +1917,10 @@ multixact_twophase_recover(TransactionId xid, uint16 info,
  *		Similar to AtEOXact_MultiXact but for COMMIT PREPARED
  */
 void
-multixact_twophase_postcommit(TransactionId xid, uint16 info,
+multixact_twophase_postcommit(FullTransactionId fxid, uint16 info,
 							  void *recdata, uint32 len)
 {
-	ProcNumber	dummyProcNumber = TwoPhaseGetDummyProcNumber(xid, true);
+	ProcNumber	dummyProcNumber = TwoPhaseGetDummyProcNumber(fxid, true);
 
 	Assert(len == sizeof(MultiXactId));
 
@@ -1932,10 +1932,10 @@ multixact_twophase_postcommit(TransactionId xid, uint16 info,
  *		This is actually just the same as the COMMIT case.
  */
 void
-multixact_twophase_postabort(TransactionId xid, uint16 info,
+multixact_twophase_postabort(FullTransactionId fxid, uint16 info,
 							 void *recdata, uint32 len)
 {
-	multixact_twophase_postcommit(xid, info, recdata, len);
+	multixact_twophase_postcommit(fxid, info, recdata, len);
 }
 
 /*
diff --git a/src/backend/access/transam/timeline.c b/src/backend/access/transam/timeline.c
index a27f27cc037..186eb91f609 100644
--- a/src/backend/access/transam/timeline.c
+++ b/src/backend/access/transam/timeline.c
@@ -154,7 +154,7 @@ readTimeLineHistory(TimeLineID targetTLI)
 		if (*ptr == '\0' || *ptr == '#')
 			continue;
 
-		nfields = sscanf(fline, "%u\t%X/%X", &tli, &switchpoint_hi, &switchpoint_lo);
+		nfields = sscanf(fline, "%u\t%X/%08X", &tli, &switchpoint_hi, &switchpoint_lo);
 
 		if (nfields < 1)
 		{
@@ -399,7 +399,7 @@ writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
 	 * parent file failed to end with one.
 	 */
 	snprintf(buffer, sizeof(buffer),
-			 "%s%u\t%X/%X\t%s\n",
+			 "%s%u\t%X/%08X\t%s\n",
 			 (srcfd < 0) ? "" : "\n",
 			 parentTLI,
 			 LSN_FORMAT_ARGS(switchpoint),
diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c
index 73a80559194..85cbe397cb2 100644
--- a/src/backend/access/transam/twophase.c
+++ b/src/backend/access/transam/twophase.c
@@ -159,7 +159,7 @@ typedef struct GlobalTransactionData
 	 */
 	XLogRecPtr	prepare_start_lsn;	/* XLOG offset of prepare record start */
 	XLogRecPtr	prepare_end_lsn;	/* XLOG offset of prepare record end */
-	TransactionId xid;			/* The GXACT id */
+	FullTransactionId fxid;		/* The GXACT full xid */
 
 	Oid			owner;			/* ID of user that executed the xact */
 	ProcNumber	locking_backend;	/* backend currently working on the xact */
@@ -197,6 +197,7 @@ static GlobalTransaction MyLockedGxact = NULL;
 
 static bool twophaseExitRegistered = false;
 
+static void PrepareRedoRemoveFull(FullTransactionId fxid, bool giveWarning);
 static void RecordTransactionCommitPrepared(TransactionId xid,
 											int nchildren,
 											TransactionId *children,
@@ -216,19 +217,19 @@ static void RecordTransactionAbortPrepared(TransactionId xid,
 										   int nstats,
 										   xl_xact_stats_item *stats,
 										   const char *gid);
-static void ProcessRecords(char *bufptr, TransactionId xid,
+static void ProcessRecords(char *bufptr, FullTransactionId fxid,
 						   const TwoPhaseCallback callbacks[]);
 static void RemoveGXact(GlobalTransaction gxact);
 
 static void XlogReadTwoPhaseData(XLogRecPtr lsn, char **buf, int *len);
-static char *ProcessTwoPhaseBuffer(TransactionId xid,
+static char *ProcessTwoPhaseBuffer(FullTransactionId fxid,
 								   XLogRecPtr prepare_start_lsn,
 								   bool fromdisk, bool setParent, bool setNextXid);
-static void MarkAsPreparingGuts(GlobalTransaction gxact, TransactionId xid,
+static void MarkAsPreparingGuts(GlobalTransaction gxact, FullTransactionId fxid,
 								const char *gid, TimestampTz prepared_at, Oid owner,
 								Oid databaseid);
-static void RemoveTwoPhaseFile(TransactionId xid, bool giveWarning);
-static void RecreateTwoPhaseFile(TransactionId xid, void *content, int len);
+static void RemoveTwoPhaseFile(FullTransactionId fxid, bool giveWarning);
+static void RecreateTwoPhaseFile(FullTransactionId fxid, void *content, int len);
 
 /*
  * Initialization of shared memory
@@ -356,7 +357,7 @@ PostPrepare_Twophase(void)
  *		Reserve the GID for the given transaction.
  */
 GlobalTransaction
-MarkAsPreparing(TransactionId xid, const char *gid,
+MarkAsPreparing(FullTransactionId fxid, const char *gid,
 				TimestampTz prepared_at, Oid owner, Oid databaseid)
 {
 	GlobalTransaction gxact;
@@ -407,7 +408,7 @@ MarkAsPreparing(TransactionId xid, const char *gid,
 	gxact = TwoPhaseState->freeGXacts;
 	TwoPhaseState->freeGXacts = gxact->next;
 
-	MarkAsPreparingGuts(gxact, xid, gid, prepared_at, owner, databaseid);
+	MarkAsPreparingGuts(gxact, fxid, gid, prepared_at, owner, databaseid);
 
 	gxact->ondisk = false;
 
@@ -430,11 +431,13 @@ MarkAsPreparing(TransactionId xid, const char *gid,
  * Note: This function should be called with appropriate locks held.
  */
 static void
-MarkAsPreparingGuts(GlobalTransaction gxact, TransactionId xid, const char *gid,
-					TimestampTz prepared_at, Oid owner, Oid databaseid)
+MarkAsPreparingGuts(GlobalTransaction gxact, FullTransactionId fxid,
+					const char *gid, TimestampTz prepared_at, Oid owner,
+					Oid databaseid)
 {
 	PGPROC	   *proc;
 	int			i;
+	TransactionId xid = XidFromFullTransactionId(fxid);
 
 	Assert(LWLockHeldByMeInMode(TwoPhaseStateLock, LW_EXCLUSIVE));
 
@@ -479,7 +482,7 @@ MarkAsPreparingGuts(GlobalTransaction gxact, TransactionId xid, const char *gid,
 	proc->subxidStatus.count = 0;
 
 	gxact->prepared_at = prepared_at;
-	gxact->xid = xid;
+	gxact->fxid = fxid;
 	gxact->owner = owner;
 	gxact->locking_backend = MyProcNumber;
 	gxact->valid = false;
@@ -797,12 +800,12 @@ pg_prepared_xact(PG_FUNCTION_ARGS)
  * caller had better hold it.
  */
 static GlobalTransaction
-TwoPhaseGetGXact(TransactionId xid, bool lock_held)
+TwoPhaseGetGXact(FullTransactionId fxid, bool lock_held)
 {
 	GlobalTransaction result = NULL;
 	int			i;
 
-	static TransactionId cached_xid = InvalidTransactionId;
+	static FullTransactionId cached_fxid = {InvalidTransactionId};
 	static GlobalTransaction cached_gxact = NULL;
 
 	Assert(!lock_held || LWLockHeldByMe(TwoPhaseStateLock));
@@ -811,7 +814,7 @@ TwoPhaseGetGXact(TransactionId xid, bool lock_held)
 	 * During a recovery, COMMIT PREPARED, or ABORT PREPARED, we'll be called
 	 * repeatedly for the same XID.  We can save work with a simple cache.
 	 */
-	if (xid == cached_xid)
+	if (FullTransactionIdEquals(fxid, cached_fxid))
 		return cached_gxact;
 
 	if (!lock_held)
@@ -821,7 +824,7 @@ TwoPhaseGetGXact(TransactionId xid, bool lock_held)
 	{
 		GlobalTransaction gxact = TwoPhaseState->prepXacts[i];
 
-		if (gxact->xid == xid)
+		if (FullTransactionIdEquals(gxact->fxid, fxid))
 		{
 			result = gxact;
 			break;
@@ -832,9 +835,10 @@ TwoPhaseGetGXact(TransactionId xid, bool lock_held)
 		LWLockRelease(TwoPhaseStateLock);
 
 	if (result == NULL)			/* should not happen */
-		elog(ERROR, "failed to find GlobalTransaction for xid %u", xid);
+		elog(ERROR, "failed to find GlobalTransaction for xid %u",
+			 XidFromFullTransactionId(fxid));
 
-	cached_xid = xid;
+	cached_fxid = fxid;
 	cached_gxact = result;
 
 	return result;
@@ -881,7 +885,7 @@ TwoPhaseGetXidByVirtualXID(VirtualTransactionId vxid,
 				*have_more = true;
 				break;
 			}
-			result = gxact->xid;
+			result = XidFromFullTransactionId(gxact->fxid);
 		}
 	}
 
@@ -892,7 +896,7 @@ TwoPhaseGetXidByVirtualXID(VirtualTransactionId vxid,
 
 /*
  * TwoPhaseGetDummyProcNumber
- *		Get the dummy proc number for prepared transaction specified by XID
+ *		Get the dummy proc number for prepared transaction
  *
  * Dummy proc numbers are similar to proc numbers of real backends.  They
  * start at MaxBackends, and are unique across all currently active real
@@ -900,24 +904,24 @@ TwoPhaseGetXidByVirtualXID(VirtualTransactionId vxid,
  * TwoPhaseStateLock will not be taken, so the caller had better hold it.
  */
 ProcNumber
-TwoPhaseGetDummyProcNumber(TransactionId xid, bool lock_held)
+TwoPhaseGetDummyProcNumber(FullTransactionId fxid, bool lock_held)
 {
-	GlobalTransaction gxact = TwoPhaseGetGXact(xid, lock_held);
+	GlobalTransaction gxact = TwoPhaseGetGXact(fxid, lock_held);
 
 	return gxact->pgprocno;
 }
 
 /*
  * TwoPhaseGetDummyProc
- *		Get the PGPROC that represents a prepared transaction specified by XID
+ *		Get the PGPROC that represents a prepared transaction
  *
  * If lock_held is set to true, TwoPhaseStateLock will not be taken, so the
  * caller had better hold it.
  */
 PGPROC *
-TwoPhaseGetDummyProc(TransactionId xid, bool lock_held)
+TwoPhaseGetDummyProc(FullTransactionId fxid, bool lock_held)
 {
-	GlobalTransaction gxact = TwoPhaseGetGXact(xid, lock_held);
+	GlobalTransaction gxact = TwoPhaseGetGXact(fxid, lock_held);
 
 	return GetPGProcByNumber(gxact->pgprocno);
 }
@@ -942,10 +946,8 @@ AdjustToFullTransactionId(TransactionId xid)
 }
 
 static inline int
-TwoPhaseFilePath(char *path, TransactionId xid)
+TwoPhaseFilePath(char *path, FullTransactionId fxid)
 {
-	FullTransactionId fxid = AdjustToFullTransactionId(xid);
-
 	return snprintf(path, MAXPGPATH, TWOPHASE_DIR "/%08X%08X",
 					EpochFromFullTransactionId(fxid),
 					XidFromFullTransactionId(fxid));
@@ -1049,7 +1051,7 @@ void
 StartPrepare(GlobalTransaction gxact)
 {
 	PGPROC	   *proc = GetPGProcByNumber(gxact->pgprocno);
-	TransactionId xid = gxact->xid;
+	TransactionId xid = XidFromFullTransactionId(gxact->fxid);
 	TwoPhaseFileHeader hdr;
 	TransactionId *children;
 	RelFileLocator *commitrels;
@@ -1281,10 +1283,11 @@ RegisterTwoPhaseRecord(TwoPhaseRmgrId rmid, uint16 info,
  * If it looks OK (has a valid magic number and CRC), return the palloc'd
  * contents of the file, issuing an error when finding corrupted data.  If
  * missing_ok is true, which indicates that missing files can be safely
- * ignored, then return NULL.  This state can be reached when doing recovery.
+ * ignored, then return NULL.  This state can be reached when doing recovery
+ * after discarding two-phase files from frozen epochs.
  */
 static char *
-ReadTwoPhaseFile(TransactionId xid, bool missing_ok)
+ReadTwoPhaseFile(FullTransactionId fxid, bool missing_ok)
 {
 	char		path[MAXPGPATH];
 	char	   *buf;
@@ -1296,7 +1299,7 @@ ReadTwoPhaseFile(TransactionId xid, bool missing_ok)
 				file_crc;
 	int			r;
 
-	TwoPhaseFilePath(path, xid);
+	TwoPhaseFilePath(path, fxid);
 
 	fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
 	if (fd < 0)
@@ -1426,12 +1429,12 @@ XlogReadTwoPhaseData(XLogRecPtr lsn, char **buf, int *len)
 		if (errormsg)
 			ereport(ERROR,
 					(errcode_for_file_access(),
-					 errmsg("could not read two-phase state from WAL at %X/%X: %s",
+					 errmsg("could not read two-phase state from WAL at %X/%08X: %s",
 							LSN_FORMAT_ARGS(lsn), errormsg)));
 		else
 			ereport(ERROR,
 					(errcode_for_file_access(),
-					 errmsg("could not read two-phase state from WAL at %X/%X",
+					 errmsg("could not read two-phase state from WAL at %X/%08X",
 							LSN_FORMAT_ARGS(lsn))));
 	}
 
@@ -1439,7 +1442,7 @@ XlogReadTwoPhaseData(XLogRecPtr lsn, char **buf, int *len)
 		(XLogRecGetInfo(xlogreader) & XLOG_XACT_OPMASK) != XLOG_XACT_PREPARE)
 		ereport(ERROR,
 				(errcode_for_file_access(),
-				 errmsg("expected two-phase state data is not present in WAL at %X/%X",
+				 errmsg("expected two-phase state data is not present in WAL at %X/%08X",
 						LSN_FORMAT_ARGS(lsn))));
 
 	if (len != NULL)
@@ -1461,6 +1464,7 @@ StandbyTransactionIdIsPrepared(TransactionId xid)
 	char	   *buf;
 	TwoPhaseFileHeader *hdr;
 	bool		result;
+	FullTransactionId fxid;
 
 	Assert(TransactionIdIsValid(xid));
 
@@ -1468,7 +1472,8 @@ StandbyTransactionIdIsPrepared(TransactionId xid)
 		return false;			/* nothing to do */
 
 	/* Read and validate file */
-	buf = ReadTwoPhaseFile(xid, true);
+	fxid = AdjustToFullTransactionId(xid);
+	buf = ReadTwoPhaseFile(fxid, true);
 	if (buf == NULL)
 		return false;
 
@@ -1488,6 +1493,7 @@ FinishPreparedTransaction(const char *gid, bool isCommit)
 {
 	GlobalTransaction gxact;
 	PGPROC	   *proc;
+	FullTransactionId fxid;
 	TransactionId xid;
 	bool		ondisk;
 	char	   *buf;
@@ -1509,7 +1515,8 @@ FinishPreparedTransaction(const char *gid, bool isCommit)
 	 */
 	gxact = LockGXact(gid, GetUserId());
 	proc = GetPGProcByNumber(gxact->pgprocno);
-	xid = gxact->xid;
+	fxid = gxact->fxid;
+	xid = XidFromFullTransactionId(fxid);
 
 	/*
 	 * Read and validate 2PC state data. State data will typically be stored
@@ -1517,7 +1524,7 @@ FinishPreparedTransaction(const char *gid, bool isCommit)
 	 * to disk if for some reason they have lived for a long time.
 	 */
 	if (gxact->ondisk)
-		buf = ReadTwoPhaseFile(xid, false);
+		buf = ReadTwoPhaseFile(fxid, false);
 	else
 		XlogReadTwoPhaseData(gxact->prepare_start_lsn, &buf, NULL);
 
@@ -1636,11 +1643,11 @@ FinishPreparedTransaction(const char *gid, bool isCommit)
 
 	/* And now do the callbacks */
 	if (isCommit)
-		ProcessRecords(bufptr, xid, twophase_postcommit_callbacks);
+		ProcessRecords(bufptr, fxid, twophase_postcommit_callbacks);
 	else
-		ProcessRecords(bufptr, xid, twophase_postabort_callbacks);
+		ProcessRecords(bufptr, fxid, twophase_postabort_callbacks);
 
-	PredicateLockTwoPhaseFinish(xid, isCommit);
+	PredicateLockTwoPhaseFinish(fxid, isCommit);
 
 	/*
 	 * Read this value while holding the two-phase lock, as the on-disk 2PC
@@ -1664,7 +1671,7 @@ FinishPreparedTransaction(const char *gid, bool isCommit)
 	 * And now we can clean up any files we may have left.
 	 */
 	if (ondisk)
-		RemoveTwoPhaseFile(xid, true);
+		RemoveTwoPhaseFile(fxid, true);
 
 	MyLockedGxact = NULL;
 
@@ -1677,7 +1684,7 @@ FinishPreparedTransaction(const char *gid, bool isCommit)
  * Scan 2PC state data in memory and call the indicated callbacks for each 2PC record.
  */
 static void
-ProcessRecords(char *bufptr, TransactionId xid,
+ProcessRecords(char *bufptr, FullTransactionId fxid,
 			   const TwoPhaseCallback callbacks[])
 {
 	for (;;)
@@ -1691,24 +1698,28 @@ ProcessRecords(char *bufptr, TransactionId xid,
 		bufptr += MAXALIGN(sizeof(TwoPhaseRecordOnDisk));
 
 		if (callbacks[record->rmid] != NULL)
-			callbacks[record->rmid] (xid, record->info, bufptr, record->len);
+			callbacks[record->rmid] (fxid, record->info, bufptr, record->len);
 
 		bufptr += MAXALIGN(record->len);
 	}
 }
 
 /*
- * Remove the 2PC file for the specified XID.
+ * Remove the 2PC file.
  *
  * If giveWarning is false, do not complain about file-not-present;
  * this is an expected case during WAL replay.
+ *
+ * This routine is used at early stages of recovery, where future and
+ * past orphaned files are checked, hence the FullTransactionId to build
+ * a complete file name fit for removal.
  */
 static void
-RemoveTwoPhaseFile(TransactionId xid, bool giveWarning)
+RemoveTwoPhaseFile(FullTransactionId fxid, bool giveWarning)
 {
 	char		path[MAXPGPATH];
 
-	TwoPhaseFilePath(path, xid);
+	TwoPhaseFilePath(path, fxid);
 	if (unlink(path))
 		if (errno != ENOENT || giveWarning)
 			ereport(WARNING,
@@ -1723,7 +1734,7 @@ RemoveTwoPhaseFile(TransactionId xid, bool giveWarning)
  * Note: content and len don't include CRC.
  */
 static void
-RecreateTwoPhaseFile(TransactionId xid, void *content, int len)
+RecreateTwoPhaseFile(FullTransactionId fxid, void *content, int len)
 {
 	char		path[MAXPGPATH];
 	pg_crc32c	statefile_crc;
@@ -1734,7 +1745,7 @@ RecreateTwoPhaseFile(TransactionId xid, void *content, int len)
 	COMP_CRC32C(statefile_crc, content, len);
 	FIN_CRC32C(statefile_crc);
 
-	TwoPhaseFilePath(path, xid);
+	TwoPhaseFilePath(path, fxid);
 
 	fd = OpenTransientFile(path,
 						   O_CREAT | O_TRUNC | O_WRONLY | PG_BINARY);
@@ -1846,7 +1857,7 @@ CheckPointTwoPhase(XLogRecPtr redo_horizon)
 			int			len;
 
 			XlogReadTwoPhaseData(gxact->prepare_start_lsn, &buf, &len);
-			RecreateTwoPhaseFile(gxact->xid, buf, len);
+			RecreateTwoPhaseFile(gxact->fxid, buf, len);
 			gxact->ondisk = true;
 			gxact->prepare_start_lsn = InvalidXLogRecPtr;
 			gxact->prepare_end_lsn = InvalidXLogRecPtr;
@@ -1897,19 +1908,17 @@ restoreTwoPhaseData(void)
 		if (strlen(clde->d_name) == 16 &&
 			strspn(clde->d_name, "0123456789ABCDEF") == 16)
 		{
-			TransactionId xid;
 			FullTransactionId fxid;
 			char	   *buf;
 
 			fxid = FullTransactionIdFromU64(strtou64(clde->d_name, NULL, 16));
-			xid = XidFromFullTransactionId(fxid);
 
-			buf = ProcessTwoPhaseBuffer(xid, InvalidXLogRecPtr,
+			buf = ProcessTwoPhaseBuffer(fxid, InvalidXLogRecPtr,
 										true, false, false);
 			if (buf == NULL)
 				continue;
 
-			PrepareRedoAdd(buf, InvalidXLogRecPtr,
+			PrepareRedoAdd(fxid, buf, InvalidXLogRecPtr,
 						   InvalidXLogRecPtr, InvalidRepOriginId);
 		}
 	}
@@ -1968,9 +1977,7 @@ PrescanPreparedTransactions(TransactionId **xids_p, int *nxids_p)
 
 		Assert(gxact->inredo);
 
-		xid = gxact->xid;
-
-		buf = ProcessTwoPhaseBuffer(xid,
+		buf = ProcessTwoPhaseBuffer(gxact->fxid,
 									gxact->prepare_start_lsn,
 									gxact->ondisk, false, true);
 
@@ -1981,6 +1988,7 @@ PrescanPreparedTransactions(TransactionId **xids_p, int *nxids_p)
 		 * OK, we think this file is valid.  Incorporate xid into the
 		 * running-minimum result.
 		 */
+		xid = XidFromFullTransactionId(gxact->fxid);
 		if (TransactionIdPrecedes(xid, result))
 			result = xid;
 
@@ -2036,15 +2044,12 @@ StandbyRecoverPreparedTransactions(void)
 	LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
 	for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
 	{
-		TransactionId xid;
 		char	   *buf;
 		GlobalTransaction gxact = TwoPhaseState->prepXacts[i];
 
 		Assert(gxact->inredo);
 
-		xid = gxact->xid;
-
-		buf = ProcessTwoPhaseBuffer(xid,
+		buf = ProcessTwoPhaseBuffer(gxact->fxid,
 									gxact->prepare_start_lsn,
 									gxact->ondisk, true, false);
 		if (buf != NULL)
@@ -2077,16 +2082,14 @@ RecoverPreparedTransactions(void)
 	LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
 	for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
 	{
-		TransactionId xid;
 		char	   *buf;
 		GlobalTransaction gxact = TwoPhaseState->prepXacts[i];
+		FullTransactionId fxid = gxact->fxid;
 		char	   *bufptr;
 		TwoPhaseFileHeader *hdr;
 		TransactionId *subxids;
 		const char *gid;
 
-		xid = gxact->xid;
-
 		/*
 		 * Reconstruct subtrans state for the transaction --- needed because
 		 * pg_subtrans is not preserved over a restart.  Note that we are
@@ -2096,17 +2099,20 @@ RecoverPreparedTransactions(void)
 		 * SubTransSetParent has been set before, if the prepared transaction
 		 * generated xid assignment records.
 		 */
-		buf = ProcessTwoPhaseBuffer(xid,
+		buf = ProcessTwoPhaseBuffer(gxact->fxid,
 									gxact->prepare_start_lsn,
 									gxact->ondisk, true, false);
 		if (buf == NULL)
 			continue;
 
 		ereport(LOG,
-				(errmsg("recovering prepared transaction %u from shared memory", xid)));
+				(errmsg("recovering prepared transaction %u of epoch %u from shared memory",
+						XidFromFullTransactionId(gxact->fxid),
+						EpochFromFullTransactionId(gxact->fxid))));
 
 		hdr = (TwoPhaseFileHeader *) buf;
-		Assert(TransactionIdEquals(hdr->xid, xid));
+		Assert(TransactionIdEquals(hdr->xid,
+								   XidFromFullTransactionId(gxact->fxid)));
 		bufptr = buf + MAXALIGN(sizeof(TwoPhaseFileHeader));
 		gid = (const char *) bufptr;
 		bufptr += MAXALIGN(hdr->gidlen);
@@ -2122,7 +2128,7 @@ RecoverPreparedTransactions(void)
 		 * Recreate its GXACT and dummy PGPROC. But, check whether it was
 		 * added in redo and already has a shmem entry for it.
 		 */
-		MarkAsPreparingGuts(gxact, xid, gid,
+		MarkAsPreparingGuts(gxact, gxact->fxid, gid,
 							hdr->prepared_at,
 							hdr->owner, hdr->database);
 
@@ -2137,7 +2143,7 @@ RecoverPreparedTransactions(void)
 		/*
 		 * Recover other state (notably locks) using resource managers.
 		 */
-		ProcessRecords(bufptr, xid, twophase_recover_callbacks);
+		ProcessRecords(bufptr, fxid, twophase_recover_callbacks);
 
 		/*
 		 * Release locks held by the standby process after we process each
@@ -2145,7 +2151,7 @@ RecoverPreparedTransactions(void)
 		 * additional locks at any one time.
 		 */
 		if (InHotStandby)
-			StandbyReleaseLockTree(xid, hdr->nsubxacts, subxids);
+			StandbyReleaseLockTree(hdr->xid, hdr->nsubxacts, subxids);
 
 		/*
 		 * We're done with recovering this transaction. Clear MyLockedGxact,
@@ -2164,7 +2170,7 @@ RecoverPreparedTransactions(void)
 /*
  * ProcessTwoPhaseBuffer
  *
- * Given a transaction id, read it either from disk or read it directly
+ * Given a FullTransactionId, read it either from disk or read it directly
  * via shmem xlog record pointer using the provided "prepare_start_lsn".
  *
  * If setParent is true, set up subtransaction parent linkages.
@@ -2173,13 +2179,12 @@ RecoverPreparedTransactions(void)
  * value scanned.
  */
 static char *
-ProcessTwoPhaseBuffer(TransactionId xid,
+ProcessTwoPhaseBuffer(FullTransactionId fxid,
 					  XLogRecPtr prepare_start_lsn,
 					  bool fromdisk,
 					  bool setParent, bool setNextXid)
 {
 	FullTransactionId nextXid = TransamVariables->nextXid;
-	TransactionId origNextXid = XidFromFullTransactionId(nextXid);
 	TransactionId *subxids;
 	char	   *buf;
 	TwoPhaseFileHeader *hdr;
@@ -2191,41 +2196,46 @@ ProcessTwoPhaseBuffer(TransactionId xid,
 		Assert(prepare_start_lsn != InvalidXLogRecPtr);
 
 	/* Already processed? */
-	if (TransactionIdDidCommit(xid) || TransactionIdDidAbort(xid))
+	if (TransactionIdDidCommit(XidFromFullTransactionId(fxid)) ||
+		TransactionIdDidAbort(XidFromFullTransactionId(fxid)))
 	{
 		if (fromdisk)
 		{
 			ereport(WARNING,
-					(errmsg("removing stale two-phase state file for transaction %u",
-							xid)));
-			RemoveTwoPhaseFile(xid, true);
+					(errmsg("removing stale two-phase state file for transaction %u of epoch %u",
+							XidFromFullTransactionId(fxid),
+							EpochFromFullTransactionId(fxid))));
+			RemoveTwoPhaseFile(fxid, true);
 		}
 		else
 		{
 			ereport(WARNING,
-					(errmsg("removing stale two-phase state from memory for transaction %u",
-							xid)));
-			PrepareRedoRemove(xid, true);
+					(errmsg("removing stale two-phase state from memory for transaction %u of epoch %u",
+							XidFromFullTransactionId(fxid),
+							EpochFromFullTransactionId(fxid))));
+			PrepareRedoRemoveFull(fxid, true);
 		}
 		return NULL;
 	}
 
 	/* Reject XID if too new */
-	if (TransactionIdFollowsOrEquals(xid, origNextXid))
+	if (FullTransactionIdFollowsOrEquals(fxid, nextXid))
 	{
 		if (fromdisk)
 		{
 			ereport(WARNING,
-					(errmsg("removing future two-phase state file for transaction %u",
-							xid)));
-			RemoveTwoPhaseFile(xid, true);
+					(errmsg("removing future two-phase state file for transaction %u of epoch %u",
+							XidFromFullTransactionId(fxid),
+							EpochFromFullTransactionId(fxid))));
+			RemoveTwoPhaseFile(fxid, true);
 		}
 		else
 		{
 			ereport(WARNING,
-					(errmsg("removing future two-phase state from memory for transaction %u",
-							xid)));
-			PrepareRedoRemove(xid, true);
+					(errmsg("removing future two-phase state from memory for transaction %u of epoch %u",
+							XidFromFullTransactionId(fxid),
+							EpochFromFullTransactionId(fxid))));
+			PrepareRedoRemoveFull(fxid, true);
 		}
 		return NULL;
 	}
@@ -2233,7 +2243,7 @@ ProcessTwoPhaseBuffer(TransactionId xid,
 	if (fromdisk)
 	{
 		/* Read and validate file */
-		buf = ReadTwoPhaseFile(xid, false);
+		buf = ReadTwoPhaseFile(fxid, false);
 	}
 	else
 	{
@@ -2243,18 +2253,20 @@ ProcessTwoPhaseBuffer(TransactionId xid,
 
 	/* Deconstruct header */
 	hdr = (TwoPhaseFileHeader *) buf;
-	if (!TransactionIdEquals(hdr->xid, xid))
+	if (!TransactionIdEquals(hdr->xid, XidFromFullTransactionId(fxid)))
 	{
 		if (fromdisk)
 			ereport(ERROR,
 					(errcode(ERRCODE_DATA_CORRUPTED),
-					 errmsg("corrupted two-phase state file for transaction %u",
-							xid)));
+					 errmsg("corrupted two-phase state file for transaction %u of epoch %u",
+							XidFromFullTransactionId(fxid),
+							EpochFromFullTransactionId(fxid))));
 		else
 			ereport(ERROR,
 					(errcode(ERRCODE_DATA_CORRUPTED),
-					 errmsg("corrupted two-phase state in memory for transaction %u",
-							xid)));
+					 errmsg("corrupted two-phase state in memory for transaction %u of epoch %u",
+							XidFromFullTransactionId(fxid),
+							EpochFromFullTransactionId(fxid))));
 	}
 
 	/*
@@ -2268,14 +2280,14 @@ ProcessTwoPhaseBuffer(TransactionId xid,
 	{
 		TransactionId subxid = subxids[i];
 
-		Assert(TransactionIdFollows(subxid, xid));
+		Assert(TransactionIdFollows(subxid, XidFromFullTransactionId(fxid)));
 
 		/* update nextXid if needed */
 		if (setNextXid)
 			AdvanceNextFullTransactionIdPastXid(subxid);
 
 		if (setParent)
-			SubTransSetParent(subxid, xid);
+			SubTransSetParent(subxid, XidFromFullTransactionId(fxid));
 	}
 
 	return buf;
@@ -2466,8 +2478,9 @@ RecordTransactionAbortPrepared(TransactionId xid,
  * data, the entry is marked as located on disk.
  */
 void
-PrepareRedoAdd(char *buf, XLogRecPtr start_lsn,
-			   XLogRecPtr end_lsn, RepOriginId origin_id)
+PrepareRedoAdd(FullTransactionId fxid, char *buf,
+			   XLogRecPtr start_lsn, XLogRecPtr end_lsn,
+			   RepOriginId origin_id)
 {
 	TwoPhaseFileHeader *hdr = (TwoPhaseFileHeader *) buf;
 	char	   *bufptr;
@@ -2477,6 +2490,13 @@ PrepareRedoAdd(char *buf, XLogRecPtr start_lsn,
 	Assert(LWLockHeldByMeInMode(TwoPhaseStateLock, LW_EXCLUSIVE));
 	Assert(RecoveryInProgress());
 
+	if (!FullTransactionIdIsValid(fxid))
+	{
+		Assert(InRecovery);
+		fxid = FullTransactionIdFromAllowableAt(TransamVariables->nextXid,
+												hdr->xid);
+	}
+
 	bufptr = buf + MAXALIGN(sizeof(TwoPhaseFileHeader));
 	gid = (const char *) bufptr;
 
@@ -2505,14 +2525,15 @@ PrepareRedoAdd(char *buf, XLogRecPtr start_lsn,
 	{
 		char		path[MAXPGPATH];
 
-		TwoPhaseFilePath(path, hdr->xid);
+		Assert(InRecovery);
+		TwoPhaseFilePath(path, fxid);
 
 		if (access(path, F_OK) == 0)
 		{
 			ereport(reachedConsistency ? ERROR : WARNING,
 					(errmsg("could not recover two-phase state file for transaction %u",
 							hdr->xid),
-					 errdetail("Two-phase state file has been found in WAL record %X/%X, but this transaction has already been restored from disk.",
+					 errdetail("Two-phase state file has been found in WAL record %X/%08X, but this transaction has already been restored from disk.",
 							   LSN_FORMAT_ARGS(start_lsn))));
 			return;
 		}
@@ -2536,7 +2557,7 @@ PrepareRedoAdd(char *buf, XLogRecPtr start_lsn,
 	gxact->prepared_at = hdr->prepared_at;
 	gxact->prepare_start_lsn = start_lsn;
 	gxact->prepare_end_lsn = end_lsn;
-	gxact->xid = hdr->xid;
+	gxact->fxid = fxid;
 	gxact->owner = hdr->owner;
 	gxact->locking_backend = INVALID_PROC_NUMBER;
 	gxact->valid = false;
@@ -2555,11 +2576,13 @@ PrepareRedoAdd(char *buf, XLogRecPtr start_lsn,
 						   false /* backward */ , false /* WAL */ );
 	}
 
-	elog(DEBUG2, "added 2PC data in shared memory for transaction %u", gxact->xid);
+	elog(DEBUG2, "added 2PC data in shared memory for transaction %u of epoch %u",
+		 XidFromFullTransactionId(gxact->fxid),
+		 EpochFromFullTransactionId(gxact->fxid));
 }
 
 /*
- * PrepareRedoRemove
+ * PrepareRedoRemoveFull
  *
  * Remove the corresponding gxact entry from TwoPhaseState. Also remove
  * the 2PC file if a prepared transaction was saved via an earlier checkpoint.
@@ -2567,8 +2590,8 @@ PrepareRedoAdd(char *buf, XLogRecPtr start_lsn,
  * Caller must hold TwoPhaseStateLock in exclusive mode, because TwoPhaseState
  * is updated.
  */
-void
-PrepareRedoRemove(TransactionId xid, bool giveWarning)
+static void
+PrepareRedoRemoveFull(FullTransactionId fxid, bool giveWarning)
 {
 	GlobalTransaction gxact = NULL;
 	int			i;
@@ -2581,7 +2604,7 @@ PrepareRedoRemove(TransactionId xid, bool giveWarning)
 	{
 		gxact = TwoPhaseState->prepXacts[i];
 
-		if (gxact->xid == xid)
+		if (FullTransactionIdEquals(gxact->fxid, fxid))
 		{
 			Assert(gxact->inredo);
 			found = true;
@@ -2598,13 +2621,29 @@ PrepareRedoRemove(TransactionId xid, bool giveWarning)
 	/*
 	 * And now we can clean up any files we may have left.
 	 */
-	elog(DEBUG2, "removing 2PC data for transaction %u", xid);
+	elog(DEBUG2, "removing 2PC data for transaction %u of epoch %u",
+		 XidFromFullTransactionId(fxid),
+		 EpochFromFullTransactionId(fxid));
+
 	if (gxact->ondisk)
-		RemoveTwoPhaseFile(xid, giveWarning);
+		RemoveTwoPhaseFile(fxid, giveWarning);
+
 	RemoveGXact(gxact);
 }
 
 /*
+ * TransactionId entry point: converts xid and calls PrepareRedoRemoveFull().
+ */
+void
+PrepareRedoRemove(TransactionId xid, bool giveWarning)
+{
+	FullTransactionId fxid;
+
+	fxid = FullTransactionIdFromAllowableAt(TransamVariables->nextXid, xid);
+	PrepareRedoRemoveFull(fxid, giveWarning);
+}
+
+/*
  * LookupGXact
  *		Check if the prepared transaction with the given GID, lsn and timestamp
  *		exists.
@@ -2648,7 +2687,7 @@ LookupGXact(const char *gid, XLogRecPtr prepare_end_lsn,
 			 * between publisher and subscriber.
 			 */
 			if (gxact->ondisk)
-				buf = ReadTwoPhaseFile(gxact->xid, false);
+				buf = ReadTwoPhaseFile(gxact->fxid, false);
 			else
 			{
 				Assert(gxact->prepare_start_lsn);
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index b885513f765..41601fcb280 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -2515,7 +2515,7 @@ static void
 PrepareTransaction(void)
 {
 	TransactionState s = CurrentTransactionState;
-	TransactionId xid = GetCurrentTransactionId();
+	FullTransactionId fxid = GetCurrentFullTransactionId();
 	GlobalTransaction gxact;
 	TimestampTz prepared_at;
 
@@ -2644,7 +2644,7 @@ PrepareTransaction(void)
 	 * Reserve the GID for this transaction. This could fail if the requested
 	 * GID is invalid or already in use.
 	 */
-	gxact = MarkAsPreparing(xid, prepareGID, prepared_at,
+	gxact = MarkAsPreparing(fxid, prepareGID, prepared_at,
 							GetUserId(), MyDatabaseId);
 	prepareGID = NULL;
 
@@ -2694,7 +2694,7 @@ PrepareTransaction(void)
 	 * ProcArrayClearTransaction().  Otherwise, a GetLockConflicts() would
 	 * conclude "xact already committed or aborted" for our locks.
 	 */
-	PostPrepare_Locks(xid);
+	PostPrepare_Locks(fxid);
 
 	/*
 	 * Let others know about no transaction in progress by me.  This has to be
@@ -2738,9 +2738,9 @@ PrepareTransaction(void)
 
 	PostPrepare_smgr();
 
-	PostPrepare_MultiXact(xid);
+	PostPrepare_MultiXact(fxid);
 
-	PostPrepare_PredicateLocks(xid);
+	PostPrepare_PredicateLocks(fxid);
 
 	ResourceOwnerRelease(TopTransactionResourceOwner,
 						 RESOURCE_RELEASE_LOCKS,
@@ -6420,7 +6420,8 @@ xact_redo(XLogReaderState *record)
 		 * gxact entry.
 		 */
 		LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
-		PrepareRedoAdd(XLogRecGetData(record),
+		PrepareRedoAdd(InvalidFullTransactionId,
+					   XLogRecGetData(record),
 					   record->ReadRecPtr,
 					   record->EndRecPtr,
 					   XLogRecGetOrigin(record));
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 1914859b2ee..a8cc6402d62 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -1028,7 +1028,7 @@ XLogInsertRecord(XLogRecData *rdata,
 		oldCxt = MemoryContextSwitchTo(walDebugCxt);
 
 		initStringInfo(&buf);
-		appendStringInfo(&buf, "INSERT @ %X/%X: ", LSN_FORMAT_ARGS(EndPos));
+		appendStringInfo(&buf, "INSERT @ %X/%08X: ", LSN_FORMAT_ARGS(EndPos));
 
 		/*
 		 * We have to piece together the WAL record data from the XLogRecData
@@ -1549,8 +1549,8 @@ WaitXLogInsertionsToFinish(XLogRecPtr upto)
 	if (upto > reservedUpto)
 	{
 		ereport(LOG,
-				(errmsg("request to flush past end of generated WAL; request %X/%X, current position %X/%X",
-						LSN_FORMAT_ARGS(upto), LSN_FORMAT_ARGS(reservedUpto))));
+				errmsg("request to flush past end of generated WAL; request %X/%08X, current position %X/%08X",
+					   LSN_FORMAT_ARGS(upto), LSN_FORMAT_ARGS(reservedUpto)));
 		upto = reservedUpto;
 	}
 
@@ -1716,7 +1716,7 @@ GetXLogBuffer(XLogRecPtr ptr, TimeLineID tli)
 		endptr = pg_atomic_read_u64(&XLogCtl->xlblocks[idx]);
 
 		if (expectedEndPtr != endptr)
-			elog(PANIC, "could not find WAL buffer for %X/%X",
+			elog(PANIC, "could not find WAL buffer for %X/%08X",
 				 LSN_FORMAT_ARGS(ptr));
 	}
 	else
@@ -1776,7 +1776,7 @@ WALReadFromBuffers(char *dstbuf, XLogRecPtr startptr, Size count,
 	inserted = pg_atomic_read_u64(&XLogCtl->logInsertResult);
 	if (startptr + count > inserted)
 		ereport(ERROR,
-				errmsg("cannot read past end of generated WAL: requested %X/%X, current position %X/%X",
+				errmsg("cannot read past end of generated WAL: requested %X/%08X, current position %X/%08X",
 					   LSN_FORMAT_ARGS(startptr + count),
 					   LSN_FORMAT_ARGS(inserted)));
 
@@ -2281,7 +2281,7 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic)
 #ifdef WAL_DEBUG
 	if (XLOG_DEBUG && npages > 0)
 	{
-		elog(DEBUG1, "initialized %d pages, up to %X/%X",
+		elog(DEBUG1, "initialized %d pages, up to %X/%08X",
 			 npages, LSN_FORMAT_ARGS(NewPageEndPtr));
 	}
 #endif
@@ -2492,7 +2492,7 @@ XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible)
 		XLogRecPtr	EndPtr = pg_atomic_read_u64(&XLogCtl->xlblocks[curridx]);
 
 		if (LogwrtResult.Write >= EndPtr)
-			elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
+			elog(PANIC, "xlog write request %X/%08X is past end of log %X/%08X",
 				 LSN_FORMAT_ARGS(LogwrtResult.Write),
 				 LSN_FORMAT_ARGS(EndPtr));
 
@@ -2892,7 +2892,7 @@ UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
 		newMinRecoveryPoint = GetCurrentReplayRecPtr(&newMinRecoveryPointTLI);
 		if (!force && newMinRecoveryPoint < lsn)
 			elog(WARNING,
-				 "xlog min recovery request %X/%X is past current point %X/%X",
+				 "xlog min recovery request %X/%08X is past current point %X/%08X",
 				 LSN_FORMAT_ARGS(lsn), LSN_FORMAT_ARGS(newMinRecoveryPoint));
 
 		/* update control file */
@@ -2905,9 +2905,9 @@ UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
 			LocalMinRecoveryPointTLI = newMinRecoveryPointTLI;
 
 			ereport(DEBUG2,
-					(errmsg_internal("updated min recovery point to %X/%X on timeline %u",
-									 LSN_FORMAT_ARGS(newMinRecoveryPoint),
-									 newMinRecoveryPointTLI)));
+					errmsg_internal("updated min recovery point to %X/%08X on timeline %u",
+									LSN_FORMAT_ARGS(newMinRecoveryPoint),
+									newMinRecoveryPointTLI));
 		}
 	}
 	LWLockRelease(ControlFileLock);
@@ -2945,7 +2945,7 @@ XLogFlush(XLogRecPtr record)
 
 #ifdef WAL_DEBUG
 	if (XLOG_DEBUG)
-		elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X",
+		elog(LOG, "xlog flush request %X/%08X; write %X/%08X; flush %X/%08X",
 			 LSN_FORMAT_ARGS(record),
 			 LSN_FORMAT_ARGS(LogwrtResult.Write),
 			 LSN_FORMAT_ARGS(LogwrtResult.Flush));
@@ -3078,7 +3078,7 @@ XLogFlush(XLogRecPtr record)
 	 */
 	if (LogwrtResult.Flush < record)
 		elog(ERROR,
-			 "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
+			 "xlog flush request %X/%08X is not satisfied --- flushed only to %X/%08X",
 			 LSN_FORMAT_ARGS(record),
 			 LSN_FORMAT_ARGS(LogwrtResult.Flush));
 }
@@ -3205,7 +3205,7 @@ XLogBackgroundFlush(void)
 
 #ifdef WAL_DEBUG
 	if (XLOG_DEBUG)
-		elog(LOG, "xlog bg flush request write %X/%X; flush: %X/%X, current is write %X/%X; flush %X/%X",
+		elog(LOG, "xlog bg flush request write %X/%08X; flush: %X/%08X, current is write %X/%08X; flush %X/%08X",
 			 LSN_FORMAT_ARGS(WriteRqst.Write),
 			 LSN_FORMAT_ARGS(WriteRqst.Flush),
 			 LSN_FORMAT_ARGS(LogwrtResult.Write),
@@ -6921,7 +6921,7 @@ LogCheckpointEnd(bool restartpoint)
 						"%d removed, %d recycled; write=%ld.%03d s, "
 						"sync=%ld.%03d s, total=%ld.%03d s; sync files=%d, "
 						"longest=%ld.%03d s, average=%ld.%03d s; distance=%d kB, "
-						"estimate=%d kB; lsn=%X/%X, redo lsn=%X/%X",
+						"estimate=%d kB; lsn=%X/%08X, redo lsn=%X/%08X",
 						CheckpointStats.ckpt_bufs_written,
 						(double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
 						CheckpointStats.ckpt_slru_written,
@@ -6945,7 +6945,7 @@ LogCheckpointEnd(bool restartpoint)
 						"%d removed, %d recycled; write=%ld.%03d s, "
 						"sync=%ld.%03d s, total=%ld.%03d s; sync files=%d, "
 						"longest=%ld.%03d s, average=%ld.%03d s; distance=%d kB, "
-						"estimate=%d kB; lsn=%X/%X, redo lsn=%X/%X",
+						"estimate=%d kB; lsn=%X/%08X, redo lsn=%X/%08X",
 						CheckpointStats.ckpt_bufs_written,
 						(double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
 						CheckpointStats.ckpt_slru_written,
@@ -7498,6 +7498,10 @@ CreateCheckPoint(int flags)
 	if (PriorRedoPtr != InvalidXLogRecPtr)
 		UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);
 
+#ifdef USE_INJECTION_POINTS
+	INJECTION_POINT("checkpoint-before-old-wal-removal", NULL);
+#endif
+
 	/*
 	 * Delete old log files, those no longer needed for last checkpoint to
 	 * prevent the disk holding the xlog from growing full.
@@ -7637,7 +7641,7 @@ CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn, XLogRecPtr pagePtr,
 	if (!RecoveryInProgress())
 		elog(ERROR, "can only be used at end of recovery");
 	if (pagePtr % XLOG_BLCKSZ != 0)
-		elog(ERROR, "invalid position for missing continuation record %X/%X",
+		elog(ERROR, "invalid position for missing continuation record %X/%08X",
 			 LSN_FORMAT_ARGS(pagePtr));
 
 	/* The current WAL insert position should be right after the page header */
@@ -7648,7 +7652,7 @@ CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn, XLogRecPtr pagePtr,
 		startPos += SizeOfXLogShortPHD;
 	recptr = GetXLogInsertRecPtr();
 	if (recptr != startPos)
-		elog(ERROR, "invalid WAL insert position %X/%X for OVERWRITE_CONTRECORD",
+		elog(ERROR, "invalid WAL insert position %X/%08X for OVERWRITE_CONTRECORD",
 			 LSN_FORMAT_ARGS(recptr));
 
 	START_CRIT_SECTION();
@@ -7678,7 +7682,7 @@ CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn, XLogRecPtr pagePtr,
 
 	/* check that the record was inserted to the right place */
 	if (ProcLastRecPtr != startPos)
-		elog(ERROR, "OVERWRITE_CONTRECORD was inserted to unexpected position %X/%X",
+		elog(ERROR, "OVERWRITE_CONTRECORD was inserted to unexpected position %X/%08X",
 			 LSN_FORMAT_ARGS(ProcLastRecPtr));
 
 	XLogFlush(recptr);
@@ -7747,8 +7751,7 @@ RecoveryRestartPoint(const CheckPoint *checkPoint, XLogReaderState *record)
 	if (XLogHaveInvalidPages())
 	{
 		elog(DEBUG2,
-			 "could not record restart point at %X/%X because there "
-			 "are unresolved references to invalid pages",
+			 "could not record restart point at %X/%08X because there are unresolved references to invalid pages",
 			 LSN_FORMAT_ARGS(checkPoint->redo));
 		return;
 	}
@@ -7828,8 +7831,8 @@ CreateRestartPoint(int flags)
 		lastCheckPoint.redo <= ControlFile->checkPointCopy.redo)
 	{
 		ereport(DEBUG2,
-				(errmsg_internal("skipping restartpoint, already performed at %X/%X",
-								 LSN_FORMAT_ARGS(lastCheckPoint.redo))));
+				errmsg_internal("skipping restartpoint, already performed at %X/%08X",
+								LSN_FORMAT_ARGS(lastCheckPoint.redo)));
 
 		UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
 		if (flags & CHECKPOINT_IS_SHUTDOWN)
@@ -8013,10 +8016,10 @@ CreateRestartPoint(int flags)
 
 	xtime = GetLatestXTime();
 	ereport((log_checkpoints ? LOG : DEBUG2),
-			(errmsg("recovery restart point at %X/%X",
-					LSN_FORMAT_ARGS(lastCheckPoint.redo)),
-			 xtime ? errdetail("Last completed transaction was at log time %s.",
-							   timestamptz_to_str(xtime)) : 0));
+			errmsg("recovery restart point at %X/%08X",
+				   LSN_FORMAT_ARGS(lastCheckPoint.redo)),
+			xtime ? errdetail("Last completed transaction was at log time %s.",
+							  timestamptz_to_str(xtime)) : 0);
 
 	/*
 	 * Finally, execute archive_cleanup_command, if any.
@@ -8277,8 +8280,8 @@ XLogRestorePoint(const char *rpName)
 	RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RESTORE_POINT);
 
 	ereport(LOG,
-			(errmsg("restore point \"%s\" created at %X/%X",
-					rpName, LSN_FORMAT_ARGS(RecPtr))));
+			errmsg("restore point \"%s\" created at %X/%08X",
+				   rpName, LSN_FORMAT_ARGS(RecPtr)));
 
 	return RecPtr;
 }
diff --git a/src/backend/access/transam/xlogbackup.c b/src/backend/access/transam/xlogbackup.c
index 342590e0a46..cda4b38b7d6 100644
--- a/src/backend/access/transam/xlogbackup.c
+++ b/src/backend/access/transam/xlogbackup.c
@@ -42,7 +42,7 @@ build_backup_content(BackupState *state, bool ishistoryfile)
 
 	XLByteToSeg(state->startpoint, startsegno, wal_segment_size);
 	XLogFileName(startxlogfile, state->starttli, startsegno, wal_segment_size);
-	appendStringInfo(result, "START WAL LOCATION: %X/%X (file %s)\n",
+	appendStringInfo(result, "START WAL LOCATION: %X/%08X (file %s)\n",
 					 LSN_FORMAT_ARGS(state->startpoint), startxlogfile);
 
 	if (ishistoryfile)
@@ -52,11 +52,11 @@ build_backup_content(BackupState *state, bool ishistoryfile)
 
 		XLByteToSeg(state->stoppoint, stopsegno, wal_segment_size);
 		XLogFileName(stopxlogfile, state->stoptli, stopsegno, wal_segment_size);
-		appendStringInfo(result, "STOP WAL LOCATION: %X/%X (file %s)\n",
+		appendStringInfo(result, "STOP WAL LOCATION: %X/%08X (file %s)\n",
 						 LSN_FORMAT_ARGS(state->stoppoint), stopxlogfile);
 	}
 
-	appendStringInfo(result, "CHECKPOINT LOCATION: %X/%X\n",
+	appendStringInfo(result, "CHECKPOINT LOCATION: %X/%08X\n",
 					 LSN_FORMAT_ARGS(state->checkpointloc));
 	appendStringInfoString(result, "BACKUP METHOD: streamed\n");
 	appendStringInfo(result, "BACKUP FROM: %s\n",
@@ -81,7 +81,7 @@ build_backup_content(BackupState *state, bool ishistoryfile)
 	Assert(XLogRecPtrIsInvalid(state->istartpoint) == (state->istarttli == 0));
 	if (!XLogRecPtrIsInvalid(state->istartpoint))
 	{
-		appendStringInfo(result, "INCREMENTAL FROM LSN: %X/%X\n",
+		appendStringInfo(result, "INCREMENTAL FROM LSN: %X/%08X\n",
 						 LSN_FORMAT_ARGS(state->istartpoint));
 		appendStringInfo(result, "INCREMENTAL FROM TLI: %u\n",
 						 state->istarttli);
diff --git a/src/backend/access/transam/xlogprefetcher.c b/src/backend/access/transam/xlogprefetcher.c
index 7735562db01..ed3aacabc98 100644
--- a/src/backend/access/transam/xlogprefetcher.c
+++ b/src/backend/access/transam/xlogprefetcher.c
@@ -546,7 +546,7 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn)
 
 #ifdef XLOGPREFETCHER_DEBUG_LEVEL
 					elog(XLOGPREFETCHER_DEBUG_LEVEL,
-						 "suppressing all readahead until %X/%X is replayed due to possible TLI change",
+						 "suppressing all readahead until %X/%08X is replayed due to possible TLI change",
 						 LSN_FORMAT_ARGS(record->lsn));
 #endif
 
@@ -579,7 +579,7 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn)
 
 #ifdef XLOGPREFETCHER_DEBUG_LEVEL
 					elog(XLOGPREFETCHER_DEBUG_LEVEL,
-						 "suppressing prefetch in database %u until %X/%X is replayed due to raw file copy",
+						 "suppressing prefetch in database %u until %X/%08X is replayed due to raw file copy",
 						 rlocator.dbOid,
 						 LSN_FORMAT_ARGS(record->lsn));
 #endif
@@ -607,7 +607,7 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn)
 
 #ifdef XLOGPREFETCHER_DEBUG_LEVEL
 						elog(XLOGPREFETCHER_DEBUG_LEVEL,
-							 "suppressing prefetch in relation %u/%u/%u until %X/%X is replayed, which creates the relation",
+							 "suppressing prefetch in relation %u/%u/%u until %X/%08X is replayed, which creates the relation",
 							 xlrec->rlocator.spcOid,
 							 xlrec->rlocator.dbOid,
 							 xlrec->rlocator.relNumber,
@@ -630,7 +630,7 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn)
 
 #ifdef XLOGPREFETCHER_DEBUG_LEVEL
 					elog(XLOGPREFETCHER_DEBUG_LEVEL,
-						 "suppressing prefetch in relation %u/%u/%u from block %u until %X/%X is replayed, which truncates the relation",
+						 "suppressing prefetch in relation %u/%u/%u from block %u until %X/%08X is replayed, which truncates the relation",
 						 xlrec->rlocator.spcOid,
 						 xlrec->rlocator.dbOid,
 						 xlrec->rlocator.relNumber,
@@ -729,7 +729,7 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn)
 			{
 #ifdef XLOGPREFETCHER_DEBUG_LEVEL
 				elog(XLOGPREFETCHER_DEBUG_LEVEL,
-					 "suppressing all prefetch in relation %u/%u/%u until %X/%X is replayed, because the relation does not exist on disk",
+					 "suppressing all prefetch in relation %u/%u/%u until %X/%08X is replayed, because the relation does not exist on disk",
 					 reln->smgr_rlocator.locator.spcOid,
 					 reln->smgr_rlocator.locator.dbOid,
 					 reln->smgr_rlocator.locator.relNumber,
@@ -750,7 +750,7 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn)
 			{
 #ifdef XLOGPREFETCHER_DEBUG_LEVEL
 				elog(XLOGPREFETCHER_DEBUG_LEVEL,
-					 "suppressing prefetch in relation %u/%u/%u from block %u until %X/%X is replayed, because the relation is too small",
+					 "suppressing prefetch in relation %u/%u/%u from block %u until %X/%08X is replayed, because the relation is too small",
 					 reln->smgr_rlocator.locator.spcOid,
 					 reln->smgr_rlocator.locator.dbOid,
 					 reln->smgr_rlocator.locator.relNumber,
@@ -928,7 +928,7 @@ XLogPrefetcherIsFiltered(XLogPrefetcher *prefetcher, RelFileLocator rlocator,
 		{
 #ifdef XLOGPREFETCHER_DEBUG_LEVEL
 			elog(XLOGPREFETCHER_DEBUG_LEVEL,
-				 "prefetch of %u/%u/%u block %u suppressed; filtering until LSN %X/%X is replayed (blocks >= %u filtered)",
+				 "prefetch of %u/%u/%u block %u suppressed; filtering until LSN %X/%08X is replayed (blocks >= %u filtered)",
 				 rlocator.spcOid, rlocator.dbOid, rlocator.relNumber, blockno,
 				 LSN_FORMAT_ARGS(filter->filter_until_replayed),
 				 filter->filter_from_block);
@@ -944,7 +944,7 @@ XLogPrefetcherIsFiltered(XLogPrefetcher *prefetcher, RelFileLocator rlocator,
 		{
 #ifdef XLOGPREFETCHER_DEBUG_LEVEL
 			elog(XLOGPREFETCHER_DEBUG_LEVEL,
-				 "prefetch of %u/%u/%u block %u suppressed; filtering until LSN %X/%X is replayed (whole database)",
+				 "prefetch of %u/%u/%u block %u suppressed; filtering until LSN %X/%08X is replayed (whole database)",
 				 rlocator.spcOid, rlocator.dbOid, rlocator.relNumber, blockno,
 				 LSN_FORMAT_ARGS(filter->filter_until_replayed));
 #endif
diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c
index 2790ade1f91..ac1f801b1eb 100644
--- a/src/backend/access/transam/xlogreader.c
+++ b/src/backend/access/transam/xlogreader.c
@@ -617,7 +617,7 @@ restart:
 	}
 	else if (targetRecOff < pageHeaderSize)
 	{
-		report_invalid_record(state, "invalid record offset at %X/%X: expected at least %u, got %u",
+		report_invalid_record(state, "invalid record offset at %X/%08X: expected at least %u, got %u",
 							  LSN_FORMAT_ARGS(RecPtr),
 							  pageHeaderSize, targetRecOff);
 		goto err;
@@ -626,7 +626,7 @@ restart:
 	if ((((XLogPageHeader) state->readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD) &&
 		targetRecOff == pageHeaderSize)
 	{
-		report_invalid_record(state, "contrecord is requested by %X/%X",
+		report_invalid_record(state, "contrecord is requested by %X/%08X",
 							  LSN_FORMAT_ARGS(RecPtr));
 		goto err;
 	}
@@ -667,7 +667,7 @@ restart:
 		if (total_len < SizeOfXLogRecord)
 		{
 			report_invalid_record(state,
-								  "invalid record length at %X/%X: expected at least %u, got %u",
+								  "invalid record length at %X/%08X: expected at least %u, got %u",
 								  LSN_FORMAT_ARGS(RecPtr),
 								  (uint32) SizeOfXLogRecord, total_len);
 			goto err;
@@ -756,7 +756,7 @@ restart:
 			if (!(pageHeader->xlp_info & XLP_FIRST_IS_CONTRECORD))
 			{
 				report_invalid_record(state,
-									  "there is no contrecord flag at %X/%X",
+									  "there is no contrecord flag at %X/%08X",
 									  LSN_FORMAT_ARGS(RecPtr));
 				goto err;
 			}
@@ -769,7 +769,7 @@ restart:
 				total_len != (pageHeader->xlp_rem_len + gotlen))
 			{
 				report_invalid_record(state,
-									  "invalid contrecord length %u (expected %lld) at %X/%X",
+									  "invalid contrecord length %u (expected %lld) at %X/%08X",
 									  pageHeader->xlp_rem_len,
 									  ((long long) total_len) - gotlen,
 									  LSN_FORMAT_ARGS(RecPtr));
@@ -1132,7 +1132,7 @@ ValidXLogRecordHeader(XLogReaderState *state, XLogRecPtr RecPtr,
 	if (record->xl_tot_len < SizeOfXLogRecord)
 	{
 		report_invalid_record(state,
-							  "invalid record length at %X/%X: expected at least %u, got %u",
+							  "invalid record length at %X/%08X: expected at least %u, got %u",
 							  LSN_FORMAT_ARGS(RecPtr),
 							  (uint32) SizeOfXLogRecord, record->xl_tot_len);
 		return false;
@@ -1140,7 +1140,7 @@ ValidXLogRecordHeader(XLogReaderState *state, XLogRecPtr RecPtr,
 	if (!RmgrIdIsValid(record->xl_rmid))
 	{
 		report_invalid_record(state,
-							  "invalid resource manager ID %u at %X/%X",
+							  "invalid resource manager ID %u at %X/%08X",
 							  record->xl_rmid, LSN_FORMAT_ARGS(RecPtr));
 		return false;
 	}
@@ -1153,7 +1153,7 @@ ValidXLogRecordHeader(XLogReaderState *state, XLogRecPtr RecPtr,
 		if (!(record->xl_prev < RecPtr))
 		{
 			report_invalid_record(state,
-								  "record with incorrect prev-link %X/%X at %X/%X",
+								  "record with incorrect prev-link %X/%08X at %X/%08X",
 								  LSN_FORMAT_ARGS(record->xl_prev),
 								  LSN_FORMAT_ARGS(RecPtr));
 			return false;
@@ -1169,7 +1169,7 @@ ValidXLogRecordHeader(XLogReaderState *state, XLogRecPtr RecPtr,
 		if (record->xl_prev != PrevRecPtr)
 		{
 			report_invalid_record(state,
-								  "record with incorrect prev-link %X/%X at %X/%X",
+								  "record with incorrect prev-link %X/%08X at %X/%08X",
 								  LSN_FORMAT_ARGS(record->xl_prev),
 								  LSN_FORMAT_ARGS(RecPtr));
 			return false;
@@ -1207,7 +1207,7 @@ ValidXLogRecord(XLogReaderState *state, XLogRecord *record, XLogRecPtr recptr)
 	if (!EQ_CRC32C(record->xl_crc, crc))
 	{
 		report_invalid_record(state,
-							  "incorrect resource manager data checksum in record at %X/%X",
+							  "incorrect resource manager data checksum in record at %X/%08X",
 							  LSN_FORMAT_ARGS(recptr));
 		return false;
 	}
@@ -1241,7 +1241,7 @@ XLogReaderValidatePageHeader(XLogReaderState *state, XLogRecPtr recptr,
 		XLogFileName(fname, state->seg.ws_tli, segno, state->segcxt.ws_segsize);
 
 		report_invalid_record(state,
-							  "invalid magic number %04X in WAL segment %s, LSN %X/%X, offset %u",
+							  "invalid magic number %04X in WAL segment %s, LSN %X/%08X, offset %u",
 							  hdr->xlp_magic,
 							  fname,
 							  LSN_FORMAT_ARGS(recptr),
@@ -1256,7 +1256,7 @@ XLogReaderValidatePageHeader(XLogReaderState *state, XLogRecPtr recptr,
 		XLogFileName(fname, state->seg.ws_tli, segno, state->segcxt.ws_segsize);
 
 		report_invalid_record(state,
-							  "invalid info bits %04X in WAL segment %s, LSN %X/%X, offset %u",
+							  "invalid info bits %04X in WAL segment %s, LSN %X/%08X, offset %u",
 							  hdr->xlp_info,
 							  fname,
 							  LSN_FORMAT_ARGS(recptr),
@@ -1298,7 +1298,7 @@ XLogReaderValidatePageHeader(XLogReaderState *state, XLogRecPtr recptr,
 
 		/* hmm, first page of file doesn't have a long header? */
 		report_invalid_record(state,
-							  "invalid info bits %04X in WAL segment %s, LSN %X/%X, offset %u",
+							  "invalid info bits %04X in WAL segment %s, LSN %X/%08X, offset %u",
 							  hdr->xlp_info,
 							  fname,
 							  LSN_FORMAT_ARGS(recptr),
@@ -1318,7 +1318,7 @@ XLogReaderValidatePageHeader(XLogReaderState *state, XLogRecPtr recptr,
 		XLogFileName(fname, state->seg.ws_tli, segno, state->segcxt.ws_segsize);
 
 		report_invalid_record(state,
-							  "unexpected pageaddr %X/%X in WAL segment %s, LSN %X/%X, offset %u",
+							  "unexpected pageaddr %X/%08X in WAL segment %s, LSN %X/%08X, offset %u",
 							  LSN_FORMAT_ARGS(hdr->xlp_pageaddr),
 							  fname,
 							  LSN_FORMAT_ARGS(recptr),
@@ -1344,7 +1344,7 @@ XLogReaderValidatePageHeader(XLogReaderState *state, XLogRecPtr recptr,
 			XLogFileName(fname, state->seg.ws_tli, segno, state->segcxt.ws_segsize);
 
 			report_invalid_record(state,
-								  "out-of-sequence timeline ID %u (after %u) in WAL segment %s, LSN %X/%X, offset %u",
+								  "out-of-sequence timeline ID %u (after %u) in WAL segment %s, LSN %X/%08X, offset %u",
 								  hdr->xlp_tli,
 								  state->latestPageTLI,
 								  fname,
@@ -1756,7 +1756,7 @@ DecodeXLogRecord(XLogReaderState *state,
 			if (block_id <= decoded->max_block_id)
 			{
 				report_invalid_record(state,
-									  "out-of-order block_id %u at %X/%X",
+									  "out-of-order block_id %u at %X/%08X",
 									  block_id,
 									  LSN_FORMAT_ARGS(state->ReadRecPtr));
 				goto err;
@@ -1780,14 +1780,14 @@ DecodeXLogRecord(XLogReaderState *state,
 			if (blk->has_data && blk->data_len == 0)
 			{
 				report_invalid_record(state,
-									  "BKPBLOCK_HAS_DATA set, but no data included at %X/%X",
+									  "BKPBLOCK_HAS_DATA set, but no data included at %X/%08X",
 									  LSN_FORMAT_ARGS(state->ReadRecPtr));
 				goto err;
 			}
 			if (!blk->has_data && blk->data_len != 0)
 			{
 				report_invalid_record(state,
-									  "BKPBLOCK_HAS_DATA not set, but data length is %u at %X/%X",
+									  "BKPBLOCK_HAS_DATA not set, but data length is %u at %X/%08X",
 									  (unsigned int) blk->data_len,
 									  LSN_FORMAT_ARGS(state->ReadRecPtr));
 				goto err;
@@ -1823,7 +1823,7 @@ DecodeXLogRecord(XLogReaderState *state,
 					 blk->bimg_len == BLCKSZ))
 				{
 					report_invalid_record(state,
-										  "BKPIMAGE_HAS_HOLE set, but hole offset %u length %u block image length %u at %X/%X",
+										  "BKPIMAGE_HAS_HOLE set, but hole offset %u length %u block image length %u at %X/%08X",
 										  (unsigned int) blk->hole_offset,
 										  (unsigned int) blk->hole_length,
 										  (unsigned int) blk->bimg_len,
@@ -1839,7 +1839,7 @@ DecodeXLogRecord(XLogReaderState *state,
 					(blk->hole_offset != 0 || blk->hole_length != 0))
 				{
 					report_invalid_record(state,
-										  "BKPIMAGE_HAS_HOLE not set, but hole offset %u length %u at %X/%X",
+										  "BKPIMAGE_HAS_HOLE not set, but hole offset %u length %u at %X/%08X",
 										  (unsigned int) blk->hole_offset,
 										  (unsigned int) blk->hole_length,
 										  LSN_FORMAT_ARGS(state->ReadRecPtr));
@@ -1853,7 +1853,7 @@ DecodeXLogRecord(XLogReaderState *state,
 					blk->bimg_len == BLCKSZ)
 				{
 					report_invalid_record(state,
-										  "BKPIMAGE_COMPRESSED set, but block image length %u at %X/%X",
+										  "BKPIMAGE_COMPRESSED set, but block image length %u at %X/%08X",
 										  (unsigned int) blk->bimg_len,
 										  LSN_FORMAT_ARGS(state->ReadRecPtr));
 					goto err;
@@ -1868,7 +1868,7 @@ DecodeXLogRecord(XLogReaderState *state,
 					blk->bimg_len != BLCKSZ)
 				{
 					report_invalid_record(state,
-										  "neither BKPIMAGE_HAS_HOLE nor BKPIMAGE_COMPRESSED set, but block image length is %u at %X/%X",
+										  "neither BKPIMAGE_HAS_HOLE nor BKPIMAGE_COMPRESSED set, but block image length is %u at %X/%08X",
 										  (unsigned int) blk->data_len,
 										  LSN_FORMAT_ARGS(state->ReadRecPtr));
 					goto err;
@@ -1884,7 +1884,7 @@ DecodeXLogRecord(XLogReaderState *state,
 				if (rlocator == NULL)
 				{
 					report_invalid_record(state,
-										  "BKPBLOCK_SAME_REL set but no previous rel at %X/%X",
+										  "BKPBLOCK_SAME_REL set but no previous rel at %X/%08X",
 										  LSN_FORMAT_ARGS(state->ReadRecPtr));
 					goto err;
 				}
@@ -1896,7 +1896,7 @@ DecodeXLogRecord(XLogReaderState *state,
 		else
 		{
 			report_invalid_record(state,
-								  "invalid block_id %u at %X/%X",
+								  "invalid block_id %u at %X/%08X",
 								  block_id, LSN_FORMAT_ARGS(state->ReadRecPtr));
 			goto err;
 		}
@@ -1963,7 +1963,7 @@ DecodeXLogRecord(XLogReaderState *state,
 
 shortdata_err:
 	report_invalid_record(state,
-						  "record with invalid length at %X/%X",
+						  "record with invalid length at %X/%08X",
 						  LSN_FORMAT_ARGS(state->ReadRecPtr));
 err:
 	*errormsg = state->errormsg_buf;
@@ -2073,14 +2073,14 @@ RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page)
 		!record->record->blocks[block_id].in_use)
 	{
 		report_invalid_record(record,
-							  "could not restore image at %X/%X with invalid block %d specified",
+							  "could not restore image at %X/%08X with invalid block %d specified",
 							  LSN_FORMAT_ARGS(record->ReadRecPtr),
 							  block_id);
 		return false;
 	}
 	if (!record->record->blocks[block_id].has_image)
 	{
-		report_invalid_record(record, "could not restore image at %X/%X with invalid state, block %d",
+		report_invalid_record(record, "could not restore image at %X/%08X with invalid state, block %d",
 							  LSN_FORMAT_ARGS(record->ReadRecPtr),
 							  block_id);
 		return false;
@@ -2107,7 +2107,7 @@ RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page)
 									bkpb->bimg_len, BLCKSZ - bkpb->hole_length) <= 0)
 				decomp_success = false;
 #else
-			report_invalid_record(record, "could not restore image at %X/%X compressed with %s not supported by build, block %d",
+			report_invalid_record(record, "could not restore image at %X/%08X compressed with %s not supported by build, block %d",
 								  LSN_FORMAT_ARGS(record->ReadRecPtr),
 								  "LZ4",
 								  block_id);
@@ -2124,7 +2124,7 @@ RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page)
 			if (ZSTD_isError(decomp_result))
 				decomp_success = false;
 #else
-			report_invalid_record(record, "could not restore image at %X/%X compressed with %s not supported by build, block %d",
+			report_invalid_record(record, "could not restore image at %X/%08X compressed with %s not supported by build, block %d",
 								  LSN_FORMAT_ARGS(record->ReadRecPtr),
 								  "zstd",
 								  block_id);
@@ -2133,7 +2133,7 @@ RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page)
 		}
 		else
 		{
-			report_invalid_record(record, "could not restore image at %X/%X compressed with unknown method, block %d",
+			report_invalid_record(record, "could not restore image at %X/%08X compressed with unknown method, block %d",
 								  LSN_FORMAT_ARGS(record->ReadRecPtr),
 								  block_id);
 			return false;
@@ -2141,7 +2141,7 @@ RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page)
 
 		if (!decomp_success)
 		{
-			report_invalid_record(record, "could not decompress image at %X/%X, block %d",
+			report_invalid_record(record, "could not decompress image at %X/%08X, block %d",
 								  LSN_FORMAT_ARGS(record->ReadRecPtr),
 								  block_id);
 			return false;
diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c
index 6ce979f2d8b..23878b2dd91 100644
--- a/src/backend/access/transam/xlogrecovery.c
+++ b/src/backend/access/transam/xlogrecovery.c
@@ -620,10 +620,10 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
 		 * than ControlFile->checkPoint is used.
 		 */
 		ereport(LOG,
-				(errmsg("starting backup recovery with redo LSN %X/%X, checkpoint LSN %X/%X, on timeline ID %u",
-						LSN_FORMAT_ARGS(RedoStartLSN),
-						LSN_FORMAT_ARGS(CheckPointLoc),
-						CheckPointTLI)));
+				errmsg("starting backup recovery with redo LSN %X/%08X, checkpoint LSN %X/%08X, on timeline ID %u",
+					   LSN_FORMAT_ARGS(RedoStartLSN),
+					   LSN_FORMAT_ARGS(CheckPointLoc),
+					   CheckPointTLI));
 
 		/*
 		 * When a backup_label file is present, we want to roll forward from
@@ -636,8 +636,8 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
 			memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
 			wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
 			ereport(DEBUG1,
-					(errmsg_internal("checkpoint record is at %X/%X",
-									 LSN_FORMAT_ARGS(CheckPointLoc))));
+					errmsg_internal("checkpoint record is at %X/%08X",
+									LSN_FORMAT_ARGS(CheckPointLoc)));
 			InRecovery = true;	/* force recovery even if SHUTDOWNED */
 
 			/*
@@ -652,23 +652,23 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
 				if (!ReadRecord(xlogprefetcher, LOG, false,
 								checkPoint.ThisTimeLineID))
 					ereport(FATAL,
-							(errmsg("could not find redo location %X/%X referenced by checkpoint record at %X/%X",
-									LSN_FORMAT_ARGS(checkPoint.redo), LSN_FORMAT_ARGS(CheckPointLoc)),
-							 errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
-									 "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
-									 "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
-									 DataDir, DataDir, DataDir, DataDir)));
+							errmsg("could not find redo location %X/%08X referenced by checkpoint record at %X/%08X",
+								   LSN_FORMAT_ARGS(checkPoint.redo), LSN_FORMAT_ARGS(CheckPointLoc)),
+							errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
+									"If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
+									"Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
+									DataDir, DataDir, DataDir, DataDir));
 			}
 		}
 		else
 		{
 			ereport(FATAL,
-					(errmsg("could not locate required checkpoint record at %X/%X",
-							LSN_FORMAT_ARGS(CheckPointLoc)),
-					 errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
-							 "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
-							 "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
-							 DataDir, DataDir, DataDir, DataDir)));
+					errmsg("could not locate required checkpoint record at %X/%08X",
+						   LSN_FORMAT_ARGS(CheckPointLoc)),
+					errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
+							"If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
+							"Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
+							DataDir, DataDir, DataDir, DataDir));
 			wasShutdown = false;	/* keep compiler quiet */
 		}
 
@@ -773,8 +773,8 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
 		 */
 		if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
 			ereport(LOG,
-					(errmsg("restarting backup recovery with redo LSN %X/%X",
-							LSN_FORMAT_ARGS(ControlFile->backupStartPoint))));
+					errmsg("restarting backup recovery with redo LSN %X/%08X",
+						   LSN_FORMAT_ARGS(ControlFile->backupStartPoint)));
 
 		/* Get the last valid checkpoint record. */
 		CheckPointLoc = ControlFile->checkPoint;
@@ -786,8 +786,8 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
 		if (record != NULL)
 		{
 			ereport(DEBUG1,
-					(errmsg_internal("checkpoint record is at %X/%X",
-									 LSN_FORMAT_ARGS(CheckPointLoc))));
+					errmsg_internal("checkpoint record is at %X/%08X",
+									LSN_FORMAT_ARGS(CheckPointLoc)));
 		}
 		else
 		{
@@ -798,8 +798,8 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
 			 * simplify processing around checkpoints.
 			 */
 			ereport(PANIC,
-					(errmsg("could not locate a valid checkpoint record at %X/%X",
-							LSN_FORMAT_ARGS(CheckPointLoc))));
+					errmsg("could not locate a valid checkpoint record at %X/%08X",
+						   LSN_FORMAT_ARGS(CheckPointLoc)));
 		}
 		memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
 		wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
@@ -824,8 +824,8 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
 							recoveryTargetName)));
 		else if (recoveryTarget == RECOVERY_TARGET_LSN)
 			ereport(LOG,
-					(errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%X\"",
-							LSN_FORMAT_ARGS(recoveryTargetLSN))));
+					errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%08X\"",
+						   LSN_FORMAT_ARGS(recoveryTargetLSN)));
 		else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
 			ereport(LOG,
 					(errmsg("starting point-in-time recovery to earliest consistent point")));
@@ -855,7 +855,7 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
 				(errmsg("requested timeline %u is not a child of this server's history",
 						recoveryTargetTLI),
 		/* translator: %s is a backup_label file or a pg_control file */
-				 errdetail("Latest checkpoint in file \"%s\" is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X.",
+				 errdetail("Latest checkpoint in file \"%s\" is at %X/%08X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%08X.",
 						   haveBackupLabel ? "backup_label" : "pg_control",
 						   LSN_FORMAT_ARGS(CheckPointLoc),
 						   CheckPointTLI,
@@ -870,15 +870,15 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
 		tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) !=
 		ControlFile->minRecoveryPointTLI)
 		ereport(FATAL,
-				(errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u",
-						recoveryTargetTLI,
-						LSN_FORMAT_ARGS(ControlFile->minRecoveryPoint),
-						ControlFile->minRecoveryPointTLI)));
+				errmsg("requested timeline %u does not contain minimum recovery point %X/%08X on timeline %u",
+					   recoveryTargetTLI,
+					   LSN_FORMAT_ARGS(ControlFile->minRecoveryPoint),
+					   ControlFile->minRecoveryPointTLI));
 
 	ereport(DEBUG1,
-			(errmsg_internal("redo record is at %X/%X; shutdown %s",
-							 LSN_FORMAT_ARGS(checkPoint.redo),
-							 wasShutdown ? "true" : "false")));
+			errmsg_internal("redo record is at %X/%08X; shutdown %s",
+							LSN_FORMAT_ARGS(checkPoint.redo),
+							wasShutdown ? "true" : "false"));
 	ereport(DEBUG1,
 			(errmsg_internal("next transaction ID: " UINT64_FORMAT "; next OID: %u",
 							 U64FromFullTransactionId(checkPoint.nextXid),
@@ -1253,14 +1253,14 @@ read_backup_label(XLogRecPtr *checkPointLoc, TimeLineID *backupLabelTLI,
 	 * is pretty crude, but we are not expecting any variability in the file
 	 * format).
 	 */
-	if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
+	if (fscanf(lfp, "START WAL LOCATION: %X/%08X (file %08X%16s)%c",
 			   &hi, &lo, &tli_from_walseg, startxlogfilename, &ch) != 5 || ch != '\n')
 		ereport(FATAL,
 				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
 				 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
 	RedoStartLSN = ((uint64) hi) << 32 | lo;
 	RedoStartTLI = tli_from_walseg;
-	if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c",
+	if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%08X%c",
 			   &hi, &lo, &ch) != 3 || ch != '\n')
 		ereport(FATAL,
 				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
@@ -1332,7 +1332,7 @@ read_backup_label(XLogRecPtr *checkPointLoc, TimeLineID *backupLabelTLI,
 								 tli_from_file, BACKUP_LABEL_FILE)));
 	}
 
-	if (fscanf(lfp, "INCREMENTAL FROM LSN: %X/%X\n", &hi, &lo) > 0)
+	if (fscanf(lfp, "INCREMENTAL FROM LSN: %X/%08X\n", &hi, &lo) > 0)
 		ereport(FATAL,
 				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
 				 errmsg("this is an incremental backup, not a data directory"),
@@ -1722,8 +1722,8 @@ PerformWalRecovery(void)
 		if (record->xl_rmid != RM_XLOG_ID ||
 			(record->xl_info & ~XLR_INFO_MASK) != XLOG_CHECKPOINT_REDO)
 			ereport(FATAL,
-					(errmsg("unexpected record type found at redo point %X/%X",
-							LSN_FORMAT_ARGS(xlogreader->ReadRecPtr))));
+					errmsg("unexpected record type found at redo point %X/%08X",
+						   LSN_FORMAT_ARGS(xlogreader->ReadRecPtr)));
 	}
 	else
 	{
@@ -1745,8 +1745,8 @@ PerformWalRecovery(void)
 		RmgrStartup();
 
 		ereport(LOG,
-				(errmsg("redo starts at %X/%X",
-						LSN_FORMAT_ARGS(xlogreader->ReadRecPtr))));
+				errmsg("redo starts at %X/%08X",
+					   LSN_FORMAT_ARGS(xlogreader->ReadRecPtr)));
 
 		/* Prepare to report progress of the redo phase. */
 		if (!StandbyMode)
@@ -1758,7 +1758,7 @@ PerformWalRecovery(void)
 		do
 		{
 			if (!StandbyMode)
-				ereport_startup_progress("redo in progress, elapsed time: %ld.%02d s, current LSN: %X/%X",
+				ereport_startup_progress("redo in progress, elapsed time: %ld.%02d s, current LSN: %X/%08X",
 										 LSN_FORMAT_ARGS(xlogreader->ReadRecPtr));
 
 #ifdef WAL_DEBUG
@@ -1767,7 +1767,7 @@ PerformWalRecovery(void)
 				StringInfoData buf;
 
 				initStringInfo(&buf);
-				appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
+				appendStringInfo(&buf, "REDO @ %X/%08X; LSN %X/%08X: ",
 								 LSN_FORMAT_ARGS(xlogreader->ReadRecPtr),
 								 LSN_FORMAT_ARGS(xlogreader->EndRecPtr));
 				xlog_outrec(&buf, xlogreader);
@@ -1880,9 +1880,9 @@ PerformWalRecovery(void)
 		RmgrCleanup();
 
 		ereport(LOG,
-				(errmsg("redo done at %X/%X system usage: %s",
-						LSN_FORMAT_ARGS(xlogreader->ReadRecPtr),
-						pg_rusage_show(&ru0))));
+				errmsg("redo done at %X/%08X system usage: %s",
+					   LSN_FORMAT_ARGS(xlogreader->ReadRecPtr),
+					   pg_rusage_show(&ru0)));
 		xtime = GetLatestXTime();
 		if (xtime)
 			ereport(LOG,
@@ -2092,7 +2092,7 @@ xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI)
 
 		memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_overwrite_contrecord));
 		if (xlrec.overwritten_lsn != record->overwrittenRecPtr)
-			elog(FATAL, "mismatching overwritten LSN %X/%X -> %X/%X",
+			elog(FATAL, "mismatching overwritten LSN %X/%08X -> %X/%08X",
 				 LSN_FORMAT_ARGS(xlrec.overwritten_lsn),
 				 LSN_FORMAT_ARGS(record->overwrittenRecPtr));
 
@@ -2101,9 +2101,9 @@ xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI)
 		missingContrecPtr = InvalidXLogRecPtr;
 
 		ereport(LOG,
-				(errmsg("successfully skipped missing contrecord at %X/%X, overwritten at %s",
-						LSN_FORMAT_ARGS(xlrec.overwritten_lsn),
-						timestamptz_to_str(xlrec.overwrite_time))));
+				errmsg("successfully skipped missing contrecord at %X/%08X, overwritten at %s",
+					   LSN_FORMAT_ARGS(xlrec.overwritten_lsn),
+					   timestamptz_to_str(xlrec.overwrite_time)));
 
 		/* Verifying the record should only happen once */
 		record->overwrittenRecPtr = InvalidXLogRecPtr;
@@ -2129,7 +2129,7 @@ xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI)
 			backupEndPoint = lsn;
 		}
 		else
-			elog(DEBUG1, "saw end-of-backup record for backup starting at %X/%X, waiting for %X/%X",
+			elog(DEBUG1, "saw end-of-backup record for backup starting at %X/%08X, waiting for %X/%08X",
 				 LSN_FORMAT_ARGS(startpoint), LSN_FORMAT_ARGS(backupStartPoint));
 	}
 }
@@ -2224,9 +2224,9 @@ CheckRecoveryConsistency(void)
 		backupEndRequired = false;
 
 		ereport(LOG,
-				(errmsg("completed backup recovery with redo LSN %X/%X and end LSN %X/%X",
-						LSN_FORMAT_ARGS(saveBackupStartPoint),
-						LSN_FORMAT_ARGS(saveBackupEndPoint))));
+				errmsg("completed backup recovery with redo LSN %X/%08X and end LSN %X/%08X",
+					   LSN_FORMAT_ARGS(saveBackupStartPoint),
+					   LSN_FORMAT_ARGS(saveBackupEndPoint)));
 	}
 
 	/*
@@ -2255,8 +2255,8 @@ CheckRecoveryConsistency(void)
 		reachedConsistency = true;
 		SendPostmasterSignal(PMSIGNAL_RECOVERY_CONSISTENT);
 		ereport(LOG,
-				(errmsg("consistent recovery state reached at %X/%X",
-						LSN_FORMAT_ARGS(lastReplayedEndRecPtr))));
+				errmsg("consistent recovery state reached at %X/%08X",
+					   LSN_FORMAT_ARGS(lastReplayedEndRecPtr)));
 	}
 
 	/*
@@ -2293,7 +2293,7 @@ rm_redo_error_callback(void *arg)
 	xlog_block_info(&buf, record);
 
 	/* translator: %s is a WAL record description */
-	errcontext("WAL redo at %X/%X for %s",
+	errcontext("WAL redo at %X/%08X for %s",
 			   LSN_FORMAT_ARGS(record->ReadRecPtr),
 			   buf.data);
 
@@ -2328,7 +2328,7 @@ xlog_outdesc(StringInfo buf, XLogReaderState *record)
 static void
 xlog_outrec(StringInfo buf, XLogReaderState *record)
 {
-	appendStringInfo(buf, "prev %X/%X; xid %u",
+	appendStringInfo(buf, "prev %X/%08X; xid %u",
 					 LSN_FORMAT_ARGS(XLogRecGetPrev(record)),
 					 XLogRecGetXid(record));
 
@@ -2416,10 +2416,10 @@ checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI,
 		lsn < minRecoveryPoint &&
 		newTLI > minRecoveryPointTLI)
 		ereport(PANIC,
-				(errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u",
-						newTLI,
-						LSN_FORMAT_ARGS(minRecoveryPoint),
-						minRecoveryPointTLI)));
+				errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%08X on timeline %u",
+					   newTLI,
+					   LSN_FORMAT_ARGS(minRecoveryPoint),
+					   minRecoveryPointTLI));
 
 	/* Looks good */
 }
@@ -2621,8 +2621,8 @@ recoveryStopsBefore(XLogReaderState *record)
 		recoveryStopTime = 0;
 		recoveryStopName[0] = '\0';
 		ereport(LOG,
-				(errmsg("recovery stopping before WAL location (LSN) \"%X/%X\"",
-						LSN_FORMAT_ARGS(recoveryStopLSN))));
+				errmsg("recovery stopping before WAL location (LSN) \"%X/%08X\"",
+					   LSN_FORMAT_ARGS(recoveryStopLSN)));
 		return true;
 	}
 
@@ -2789,8 +2789,8 @@ recoveryStopsAfter(XLogReaderState *record)
 		recoveryStopTime = 0;
 		recoveryStopName[0] = '\0';
 		ereport(LOG,
-				(errmsg("recovery stopping after WAL location (LSN) \"%X/%X\"",
-						LSN_FORMAT_ARGS(recoveryStopLSN))));
+				errmsg("recovery stopping after WAL location (LSN) \"%X/%08X\"",
+					   LSN_FORMAT_ARGS(recoveryStopLSN)));
 		return true;
 	}
 
@@ -2910,7 +2910,7 @@ getRecoveryStopReason(void)
 				 timestamptz_to_str(recoveryStopTime));
 	else if (recoveryTarget == RECOVERY_TARGET_LSN)
 		snprintf(reason, sizeof(reason),
-				 "%s LSN %X/%X\n",
+				 "%s LSN %X/%08X\n",
 				 recoveryStopAfter ? "after" : "before",
 				 LSN_FORMAT_ARGS(recoveryStopLSN));
 	else if (recoveryTarget == RECOVERY_TARGET_NAME)
@@ -3213,11 +3213,11 @@ ReadRecord(XLogPrefetcher *xlogprefetcher, int emode,
 			XLogFileName(fname, xlogreader->seg.ws_tli, segno,
 						 wal_segment_size);
 			ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr),
-					(errmsg("unexpected timeline ID %u in WAL segment %s, LSN %X/%X, offset %u",
-							xlogreader->latestPageTLI,
-							fname,
-							LSN_FORMAT_ARGS(xlogreader->latestPagePtr),
-							offset)));
+					errmsg("unexpected timeline ID %u in WAL segment %s, LSN %X/%08X, offset %u",
+						   xlogreader->latestPageTLI,
+						   fname,
+						   LSN_FORMAT_ARGS(xlogreader->latestPagePtr),
+						   offset));
 			record = NULL;
 		}
 
@@ -3429,14 +3429,14 @@ retry:
 			errno = save_errno;
 			ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
 					(errcode_for_file_access(),
-					 errmsg("could not read from WAL segment %s, LSN %X/%X, offset %u: %m",
+					 errmsg("could not read from WAL segment %s, LSN %X/%08X, offset %u: %m",
 							fname, LSN_FORMAT_ARGS(targetPagePtr),
 							readOff)));
 		}
 		else
 			ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
 					(errcode(ERRCODE_DATA_CORRUPTED),
-					 errmsg("could not read from WAL segment %s, LSN %X/%X, offset %u: read %d of %zu",
+					 errmsg("could not read from WAL segment %s, LSN %X/%08X, offset %u: read %d of %zu",
 							fname, LSN_FORMAT_ARGS(targetPagePtr),
 							readOff, r, (Size) XLOG_BLCKSZ)));
 		goto next_record_is_invalid;
@@ -3718,7 +3718,7 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
 						wait_time = wal_retrieve_retry_interval -
 							TimestampDifferenceMilliseconds(last_fail_time, now);
 
-						elog(LOG, "waiting for WAL to become available at %X/%X",
+						elog(LOG, "waiting for WAL to become available at %X/%08X",
 							 LSN_FORMAT_ARGS(RecPtr));
 
 						/* Do background tasks that might benefit us later. */
@@ -3864,7 +3864,7 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
 							tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);
 
 							if (curFileTLI > 0 && tli < curFileTLI)
-								elog(ERROR, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
+								elog(ERROR, "according to history file, WAL location %X/%08X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
 									 LSN_FORMAT_ARGS(tliRecPtr),
 									 tli, curFileTLI);
 						}
@@ -4177,10 +4177,10 @@ rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN)
 	if (currentTle->end < replayLSN)
 	{
 		ereport(LOG,
-				(errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X",
-						newtarget,
-						replayTLI,
-						LSN_FORMAT_ARGS(replayLSN))));
+				errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%08X",
+					   newtarget,
+					   replayTLI,
+					   LSN_FORMAT_ARGS(replayLSN)));
 		return false;
 	}
 
@@ -4994,13 +4994,25 @@ check_recovery_target_timeline(char **newval, void **extra, GucSource source)
 		rttg = RECOVERY_TARGET_TIMELINE_LATEST;
 	else
 	{
+		char	   *endp;
+		uint64		timeline;
+
 		rttg = RECOVERY_TARGET_TIMELINE_NUMERIC;
 
 		errno = 0;
-		strtoul(*newval, NULL, 0);
-		if (errno == EINVAL || errno == ERANGE)
+		timeline = strtou64(*newval, &endp, 0);
+
+		if (*endp != '\0' || errno == EINVAL || errno == ERANGE)
+		{
+			GUC_check_errdetail("\"%s\" is not a valid number.",
+								"recovery_target_timeline");
+			return false;
+		}
+
+		if (timeline < 1 || timeline > PG_UINT32_MAX)
 		{
-			GUC_check_errdetail("\"recovery_target_timeline\" is not a valid number.");
+			GUC_check_errdetail("\"%s\" must be between %u and %u.",
+								"recovery_target_timeline", 1U, PG_UINT32_MAX);
 			return false;
 		}
 	}
diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c
index c389b27f77d..27ea52fdfee 100644
--- a/src/backend/access/transam/xlogutils.c
+++ b/src/backend/access/transam/xlogutils.c
@@ -795,7 +795,7 @@ XLogReadDetermineTimeline(XLogReaderState *state, XLogRecPtr wantPage,
 
 		list_free_deep(timelineHistory);
 
-		elog(DEBUG3, "switched to timeline %u valid until %X/%X",
+		elog(DEBUG3, "switched to timeline %u valid until %X/%08X",
 			 state->currTLI,
 			 LSN_FORMAT_ARGS(state->currTLIValidUntil));
 	}