Diffstat (limited to 'src/backend')
95 files changed, 3106 insertions, 993 deletions
diff --git a/src/backend/access/common/toast_compression.c b/src/backend/access/common/toast_compression.c index 21f2f4af97e..926f1e4008a 100644 --- a/src/backend/access/common/toast_compression.c +++ b/src/backend/access/common/toast_compression.c @@ -25,11 +25,11 @@ /* GUC */ int default_toast_compression = TOAST_PGLZ_COMPRESSION; -#define NO_LZ4_SUPPORT() \ +#define NO_COMPRESSION_SUPPORT(method) \ ereport(ERROR, \ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), \ - errmsg("compression method lz4 not supported"), \ - errdetail("This functionality requires the server to be built with lz4 support."))) + errmsg("compression method %s not supported", method), \ + errdetail("This functionality requires the server to be built with %s support.", method))) /* * Compress a varlena using PGLZ. @@ -139,7 +139,7 @@ struct varlena * lz4_compress_datum(const struct varlena *value) { #ifndef USE_LZ4 - NO_LZ4_SUPPORT(); + NO_COMPRESSION_SUPPORT("lz4"); return NULL; /* keep compiler quiet */ #else int32 valsize; @@ -182,7 +182,7 @@ struct varlena * lz4_decompress_datum(const struct varlena *value) { #ifndef USE_LZ4 - NO_LZ4_SUPPORT(); + NO_COMPRESSION_SUPPORT("lz4"); return NULL; /* keep compiler quiet */ #else int32 rawsize; @@ -215,7 +215,7 @@ struct varlena * lz4_decompress_datum_slice(const struct varlena *value, int32 slicelength) { #ifndef USE_LZ4 - NO_LZ4_SUPPORT(); + NO_COMPRESSION_SUPPORT("lz4"); return NULL; /* keep compiler quiet */ #else int32 rawsize; @@ -289,7 +289,7 @@ CompressionNameToMethod(const char *compression) else if (strcmp(compression, "lz4") == 0) { #ifndef USE_LZ4 - NO_LZ4_SUPPORT(); + NO_COMPRESSION_SUPPORT("lz4"); #endif return TOAST_LZ4_COMPRESSION; } diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c index 745a04ef26e..8f918e00af7 100644 --- a/src/backend/access/heap/visibilitymap.c +++ b/src/backend/access/heap/visibilitymap.c @@ -364,7 +364,7 @@ visibilitymap_get_status(Relation rel, BlockNumber heapBlk, Buffer *vmbuf) { *vmbuf = vm_readbuf(rel, mapBlock, false); if (!BufferIsValid(*vmbuf)) - return false; + return (uint8) 0; } map = PageGetContents(BufferGetPage(*vmbuf)); diff --git a/src/backend/access/index/amapi.c b/src/backend/access/index/amapi.c index f0f4f974bce..60684c53422 100644 --- a/src/backend/access/index/amapi.c +++ b/src/backend/access/index/amapi.c @@ -42,6 +42,19 @@ GetIndexAmRoutine(Oid amhandler) elog(ERROR, "index access method handler function %u did not return an IndexAmRoutine struct", amhandler); + /* Assert that all required callbacks are present. 
*/ + Assert(routine->ambuild != NULL); + Assert(routine->ambuildempty != NULL); + Assert(routine->aminsert != NULL); + Assert(routine->ambulkdelete != NULL); + Assert(routine->amvacuumcleanup != NULL); + Assert(routine->amcostestimate != NULL); + Assert(routine->amoptions != NULL); + Assert(routine->amvalidate != NULL); + Assert(routine->ambeginscan != NULL); + Assert(routine->amrescan != NULL); + Assert(routine->amendscan != NULL); + return routine; } diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index 4af1ff1e9e5..d69798795b4 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -892,9 +892,9 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) OffsetNumber offnum; BTScanInsertData inskey; ScanKey startKeys[INDEX_MAX_KEYS]; - ScanKeyData notnullkeys[INDEX_MAX_KEYS]; + ScanKeyData notnullkey; int keysz = 0; - StrategyNumber strat_total; + StrategyNumber strat_total = InvalidStrategy; BlockNumber blkno = InvalidBlockNumber, lastcurrblkno; @@ -1034,7 +1034,6 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) * need to be kept in sync. *---------- */ - strat_total = BTEqualStrategyNumber; if (so->numberOfKeys > 0) { AttrNumber curattr; @@ -1122,16 +1121,15 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) ScanDirectionIsForward(dir) : ScanDirectionIsBackward(dir))) { - /* Yes, so build the key in notnullkeys[keysz] */ - bkey = ¬nullkeys[keysz]; + /* Final startKeys[] entry will be deduced NOT NULL key */ + bkey = ¬nullkey; ScanKeyEntryInitialize(bkey, (SK_SEARCHNOTNULL | SK_ISNULL | (impliesNN->sk_flags & (SK_BT_DESC | SK_BT_NULLS_FIRST))), curattr, - ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ? - BTGreaterStrategyNumber : - BTLessStrategyNumber), + ScanDirectionIsForward(dir) ? + BTGreaterStrategyNumber : BTLessStrategyNumber, InvalidOid, InvalidOid, InvalidOid, diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index 85cbe397cb2..7918176fc58 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -1183,7 +1183,11 @@ EndPrepare(GlobalTransaction gxact) * starting immediately after the WAL record is inserted could complete * without fsync'ing our state file. (This is essentially the same kind * of race condition as the COMMIT-to-clog-write case that - * RecordTransactionCommit uses DELAY_CHKPT_START for; see notes there.) + * RecordTransactionCommit uses DELAY_CHKPT_IN_COMMIT for; see notes + * there.) Note that DELAY_CHKPT_IN_COMMIT is used to find transactions in + * the critical commit section. We need to know about such transactions + * for conflict detection in logical replication. See + * GetOldestActiveTransactionId(true, false) and its use. * * We save the PREPARE record's location in the gxact for later use by * CheckPointTwoPhase. @@ -2298,7 +2302,7 @@ ProcessTwoPhaseBuffer(FullTransactionId fxid, * RecordTransactionCommitPrepared * * This is basically the same as RecordTransactionCommit (q.v. if you change - * this function): in particular, we must set DELAY_CHKPT_START to avoid a + * this function): in particular, we must set DELAY_CHKPT_IN_COMMIT to avoid a * race condition. 
* * We know the transaction made at least one XLOG entry (its PREPARE), @@ -2318,7 +2322,7 @@ RecordTransactionCommitPrepared(TransactionId xid, const char *gid) { XLogRecPtr recptr; - TimestampTz committs = GetCurrentTimestamp(); + TimestampTz committs; bool replorigin; /* @@ -2331,8 +2335,24 @@ RecordTransactionCommitPrepared(TransactionId xid, START_CRIT_SECTION(); /* See notes in RecordTransactionCommit */ - Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0); - MyProc->delayChkptFlags |= DELAY_CHKPT_START; + Assert((MyProc->delayChkptFlags & DELAY_CHKPT_IN_COMMIT) == 0); + MyProc->delayChkptFlags |= DELAY_CHKPT_IN_COMMIT; + + /* + * Ensures the DELAY_CHKPT_IN_COMMIT flag write is globally visible before + * commit time is written. + */ + pg_write_barrier(); + + /* + * Note it is important to set committs value after marking ourselves as + * in the commit critical section (DELAY_CHKPT_IN_COMMIT). This is because + * we want to ensure all transactions that have acquired commit timestamp + * are finished before we allow the logical replication client to advance + * its xid which is used to hold back dead rows for conflict detection. + * See comments atop worker.c. + */ + committs = GetCurrentTimestamp(); /* * Emit the XLOG commit record. Note that we mark 2PC commits as @@ -2381,7 +2401,7 @@ RecordTransactionCommitPrepared(TransactionId xid, TransactionIdCommitTree(xid, nchildren, children); /* Checkpoint can proceed now */ - MyProc->delayChkptFlags &= ~DELAY_CHKPT_START; + MyProc->delayChkptFlags &= ~DELAY_CHKPT_IN_COMMIT; END_CRIT_SECTION(); diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 41601fcb280..b46e7e9c2a6 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -1431,10 +1431,22 @@ RecordTransactionCommit(void) * without holding the ProcArrayLock, since we're the only one * modifying it. This makes checkpoint's determination of which xacts * are delaying the checkpoint a bit fuzzy, but it doesn't matter. + * + * Note, it is important to get the commit timestamp after marking the + * transaction in the commit critical section. See + * RecordTransactionCommitPrepared. */ - Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0); + Assert((MyProc->delayChkptFlags & DELAY_CHKPT_IN_COMMIT) == 0); START_CRIT_SECTION(); - MyProc->delayChkptFlags |= DELAY_CHKPT_START; + MyProc->delayChkptFlags |= DELAY_CHKPT_IN_COMMIT; + + Assert(xactStopTimestamp == 0); + + /* + * Ensures the DELAY_CHKPT_IN_COMMIT flag write is globally visible + * before commit time is written. + */ + pg_write_barrier(); /* * Insert the commit XLOG record. 
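The hunks in twophase.c and xact.c around this point all follow the same ordering discipline: mark the backend with DELAY_CHKPT_IN_COMMIT, issue a write barrier, and only then acquire the commit timestamp. A condensed sketch of that pattern is given below; it assumes the standard backend headers named in the includes and elides WAL insertion, clog updates, and error handling, so it is a model of the ordering rather than a replacement for RecordTransactionCommit().

#include "postgres.h"
#include "miscadmin.h"
#include "port/atomics.h"
#include "storage/proc.h"
#include "utils/timestamp.h"

static void
sketch_commit_timestamp_ordering(void)
{
    TimestampTz committs;

    START_CRIT_SECTION();

    /* Advertise that this backend is inside the commit critical section. */
    Assert((MyProc->delayChkptFlags & DELAY_CHKPT_IN_COMMIT) == 0);
    MyProc->delayChkptFlags |= DELAY_CHKPT_IN_COMMIT;

    /*
     * Make the flag globally visible before the commit timestamp is taken,
     * so a concurrent scan cannot observe the timestamp while still missing
     * the flag.
     */
    pg_write_barrier();

    /* Only now acquire the commit timestamp. */
    committs = GetCurrentTimestamp();
    (void) committs;        /* ... build and insert the commit record ... */

    /* Checkpoint can proceed again once the commit record is in place. */
    MyProc->delayChkptFlags &= ~DELAY_CHKPT_IN_COMMIT;

    END_CRIT_SECTION();
}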
@@ -1537,7 +1549,7 @@ RecordTransactionCommit(void) */ if (markXidCommitted) { - MyProc->delayChkptFlags &= ~DELAY_CHKPT_START; + MyProc->delayChkptFlags &= ~DELAY_CHKPT_IN_COMMIT; END_CRIT_SECTION(); } diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 8e7827c6ed9..5553c20fee8 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -96,6 +96,7 @@ #include "utils/guc_hooks.h" #include "utils/guc_tables.h" #include "utils/injection_point.h" +#include "utils/pgstat_internal.h" #include "utils/ps_status.h" #include "utils/relmapper.h" #include "utils/snapmgr.h" @@ -702,7 +703,7 @@ static void InitControlFile(uint64 sysidentifier, uint32 data_checksum_version); static void WriteControlFile(void); static void ReadControlFile(void); static void UpdateControlFile(void); -static char *str_time(pg_time_t tnow); +static char *str_time(pg_time_t tnow, char *buf, size_t bufsize); static int get_sync_bit(int method); @@ -1091,6 +1092,9 @@ XLogInsertRecord(XLogRecData *rdata, pgWalUsage.wal_bytes += rechdr->xl_tot_len; pgWalUsage.wal_records++; pgWalUsage.wal_fpi += num_fpi; + + /* Required for the flush of pending stats WAL data */ + pgstat_report_fixed = true; } return EndPos; @@ -2108,6 +2112,12 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic) LWLockRelease(WALWriteLock); pgWalUsage.wal_buffers_full++; TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE(); + + /* + * Required for the flush of pending stats WAL data, per + * update of pgWalUsage. + */ + pgstat_report_fixed = true; } } } @@ -5361,11 +5371,9 @@ BootStrapXLOG(uint32 data_checksum_version) } static char * -str_time(pg_time_t tnow) +str_time(pg_time_t tnow, char *buf, size_t bufsize) { - char *buf = palloc(128); - - pg_strftime(buf, 128, + pg_strftime(buf, bufsize, "%Y-%m-%d %H:%M:%S %Z", pg_localtime(&tnow, log_timezone)); @@ -5608,6 +5616,7 @@ StartupXLOG(void) XLogRecPtr missingContrecPtr; TransactionId oldestActiveXID; bool promoted = false; + char timebuf[128]; /* * We should have an aux process resource owner to use, and we should not @@ -5636,25 +5645,29 @@ StartupXLOG(void) */ ereport(IsPostmasterEnvironment ? 
LOG : NOTICE, (errmsg("database system was shut down at %s", - str_time(ControlFile->time)))); + str_time(ControlFile->time, + timebuf, sizeof(timebuf))))); break; case DB_SHUTDOWNED_IN_RECOVERY: ereport(LOG, (errmsg("database system was shut down in recovery at %s", - str_time(ControlFile->time)))); + str_time(ControlFile->time, + timebuf, sizeof(timebuf))))); break; case DB_SHUTDOWNING: ereport(LOG, (errmsg("database system shutdown was interrupted; last known up at %s", - str_time(ControlFile->time)))); + str_time(ControlFile->time, + timebuf, sizeof(timebuf))))); break; case DB_IN_CRASH_RECOVERY: ereport(LOG, (errmsg("database system was interrupted while in recovery at %s", - str_time(ControlFile->time)), + str_time(ControlFile->time, + timebuf, sizeof(timebuf))), errhint("This probably means that some data is corrupted and" " you will have to use the last backup for recovery."))); break; @@ -5662,7 +5675,8 @@ StartupXLOG(void) case DB_IN_ARCHIVE_RECOVERY: ereport(LOG, (errmsg("database system was interrupted while in recovery at log time %s", - str_time(ControlFile->checkPointCopy.time)), + str_time(ControlFile->checkPointCopy.time, + timebuf, sizeof(timebuf))), errhint("If this has occurred more than once some data might be corrupted" " and you might need to choose an earlier recovery target."))); break; @@ -5670,7 +5684,8 @@ StartupXLOG(void) case DB_IN_PRODUCTION: ereport(LOG, (errmsg("database system was interrupted; last known up at %s", - str_time(ControlFile->time)))); + str_time(ControlFile->time, + timebuf, sizeof(timebuf))))); break; default: @@ -6315,6 +6330,12 @@ StartupXLOG(void) */ CompleteCommitTsInitialization(); + /* Clean up EndOfWalRecoveryInfo data to appease Valgrind leak checking */ + if (endOfRecoveryInfo->lastPage) + pfree(endOfRecoveryInfo->lastPage); + pfree(endOfRecoveryInfo->recoveryStopReason); + pfree(endOfRecoveryInfo); + /* * All done with end-of-recovery actions. * @@ -7121,7 +7142,7 @@ CreateCheckPoint(int flags) * starting snapshot of locks and transactions. */ if (!shutdown && XLogStandbyInfoActive()) - checkPoint.oldestActiveXid = GetOldestActiveTransactionId(); + checkPoint.oldestActiveXid = GetOldestActiveTransactionId(false, true); else checkPoint.oldestActiveXid = InvalidTransactionId; diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c index ac1f801b1eb..dcc8d4f9c1b 100644 --- a/src/backend/access/transam/xlogreader.c +++ b/src/backend/access/transam/xlogreader.c @@ -723,11 +723,12 @@ restart: /* Calculate pointer to beginning of next page */ targetPagePtr += XLOG_BLCKSZ; - /* Wait for the next page to become available */ - readOff = ReadPageInternal(state, targetPagePtr, - Min(total_len - gotlen + SizeOfXLogShortPHD, - XLOG_BLCKSZ)); - + /* + * Read the page header before processing the record data, so we + * can handle the case where the previous record ended as being a + * partial one. 
+ */ + readOff = ReadPageInternal(state, targetPagePtr, SizeOfXLogShortPHD); if (readOff == XLREAD_WOULDBLOCK) return XLREAD_WOULDBLOCK; else if (readOff < 0) @@ -776,6 +777,15 @@ restart: goto err; } + /* Wait for the next page to become available */ + readOff = ReadPageInternal(state, targetPagePtr, + Min(total_len - gotlen + SizeOfXLogShortPHD, + XLOG_BLCKSZ)); + if (readOff == XLREAD_WOULDBLOCK) + return XLREAD_WOULDBLOCK; + else if (readOff < 0) + goto err; + /* Append the continuation from this page to the buffer */ pageHeaderSize = XLogPageHeaderSize(pageHeader); diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c index 23878b2dd91..f23ec8969c2 100644 --- a/src/backend/access/transam/xlogrecovery.c +++ b/src/backend/access/transam/xlogrecovery.c @@ -1626,6 +1626,7 @@ ShutdownWalRecovery(void) close(readFile); readFile = -1; } + pfree(xlogreader->private_data); XLogReaderFree(xlogreader); XLogPrefetcherFree(xlogprefetcher); @@ -4760,7 +4761,7 @@ bool check_primary_slot_name(char **newval, void **extra, GucSource source) { if (*newval && strcmp(*newval, "") != 0 && - !ReplicationSlotValidateName(*newval, WARNING)) + !ReplicationSlotValidateName(*newval, false, WARNING)) return false; return true; diff --git a/src/backend/catalog/pg_subscription.c b/src/backend/catalog/pg_subscription.c index 1395032413e..244acf52f36 100644 --- a/src/backend/catalog/pg_subscription.c +++ b/src/backend/catalog/pg_subscription.c @@ -103,6 +103,7 @@ GetSubscription(Oid subid, bool missing_ok) sub->passwordrequired = subform->subpasswordrequired; sub->runasowner = subform->subrunasowner; sub->failover = subform->subfailover; + sub->retaindeadtuples = subform->subretaindeadtuples; /* Get conninfo */ datum = SysCacheGetAttrNotNull(SUBSCRIPTIONOID, @@ -319,7 +320,7 @@ AddSubscriptionRelState(Oid subid, Oid relid, char state, */ void UpdateSubscriptionRelState(Oid subid, Oid relid, char state, - XLogRecPtr sublsn) + XLogRecPtr sublsn, bool already_locked) { Relation rel; HeapTuple tup; @@ -327,9 +328,24 @@ UpdateSubscriptionRelState(Oid subid, Oid relid, char state, Datum values[Natts_pg_subscription_rel]; bool replaces[Natts_pg_subscription_rel]; - LockSharedObject(SubscriptionRelationId, subid, 0, AccessShareLock); + if (already_locked) + { +#ifdef USE_ASSERT_CHECKING + LOCKTAG tag; - rel = table_open(SubscriptionRelRelationId, RowExclusiveLock); + Assert(CheckRelationOidLockedByMe(SubscriptionRelRelationId, + RowExclusiveLock, true)); + SET_LOCKTAG_OBJECT(tag, InvalidOid, SubscriptionRelationId, subid, 0); + Assert(LockHeldByMe(&tag, AccessShareLock, true)); +#endif + + rel = table_open(SubscriptionRelRelationId, NoLock); + } + else + { + LockSharedObject(SubscriptionRelationId, subid, 0, AccessShareLock); + rel = table_open(SubscriptionRelRelationId, RowExclusiveLock); + } /* Try finding existing mapping. 
*/ tup = SearchSysCacheCopy2(SUBSCRIPTIONRELMAP, diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index b2d5332effc..f6eca09ee15 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -1386,7 +1386,8 @@ REVOKE ALL ON pg_subscription FROM public; GRANT SELECT (oid, subdbid, subskiplsn, subname, subowner, subenabled, subbinary, substream, subtwophasestate, subdisableonerr, subpasswordrequired, subrunasowner, subfailover, - subslotname, subsynccommit, subpublications, suborigin) + subretaindeadtuples, subslotname, subsynccommit, + subpublications, suborigin) ON pg_subscription TO public; CREATE VIEW pg_stat_subscription_stats AS diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 7e2792ead71..8345bc0264b 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -3582,6 +3582,7 @@ static void show_memoize_info(MemoizeState *mstate, List *ancestors, ExplainState *es) { Plan *plan = ((PlanState *) mstate)->plan; + Memoize *mplan = (Memoize *) plan; ListCell *lc; List *context; StringInfoData keystr; @@ -3602,7 +3603,7 @@ show_memoize_info(MemoizeState *mstate, List *ancestors, ExplainState *es) plan, ancestors); - foreach(lc, ((Memoize *) plan)->param_exprs) + foreach(lc, mplan->param_exprs) { Node *expr = (Node *) lfirst(lc); @@ -3618,6 +3619,24 @@ show_memoize_info(MemoizeState *mstate, List *ancestors, ExplainState *es) pfree(keystr.data); + if (es->costs) + { + if (es->format == EXPLAIN_FORMAT_TEXT) + { + ExplainIndentText(es); + appendStringInfo(es->str, "Estimates: capacity=%u distinct keys=%.0f lookups=%.0f hit percent=%.2f%%\n", + mplan->est_entries, mplan->est_unique_keys, + mplan->est_calls, mplan->est_hit_ratio * 100.0); + } + else + { + ExplainPropertyUInteger("Estimated Capacity", NULL, mplan->est_entries, es); + ExplainPropertyFloat("Estimated Distinct Lookup Keys", NULL, mplan->est_unique_keys, 0, es); + ExplainPropertyFloat("Estimated Lookups", NULL, mplan->est_calls, 0, es); + ExplainPropertyFloat("Estimated Hit Percent", NULL, mplan->est_hit_ratio * 100.0, 2, es); + } + } + if (!es->analyze) return; diff --git a/src/backend/commands/foreigncmds.c b/src/backend/commands/foreigncmds.c index 8d2d7431544..77f8461f42e 100644 --- a/src/backend/commands/foreigncmds.c +++ b/src/backend/commands/foreigncmds.c @@ -1588,6 +1588,7 @@ ImportForeignSchema(ImportForeignSchemaStmt *stmt) pstmt->utilityStmt = (Node *) cstmt; pstmt->stmt_location = rs->stmt_location; pstmt->stmt_len = rs->stmt_len; + pstmt->planOrigin = PLAN_STMT_INTERNAL; /* Execute statement */ ProcessUtility(pstmt, cmd, false, diff --git a/src/backend/commands/schemacmds.c b/src/backend/commands/schemacmds.c index 546160f0941..0f03d9743d2 100644 --- a/src/backend/commands/schemacmds.c +++ b/src/backend/commands/schemacmds.c @@ -215,6 +215,7 @@ CreateSchemaCommand(CreateSchemaStmt *stmt, const char *queryString, wrapper->utilityStmt = stmt; wrapper->stmt_location = stmt_location; wrapper->stmt_len = stmt_len; + wrapper->planOrigin = PLAN_STMT_INTERNAL; /* do this step */ ProcessUtility(wrapper, diff --git a/src/backend/commands/subscriptioncmds.c b/src/backend/commands/subscriptioncmds.c index e23b0de7242..cd6c3684482 100644 --- a/src/backend/commands/subscriptioncmds.c +++ b/src/backend/commands/subscriptioncmds.c @@ -14,6 +14,7 @@ #include "postgres.h" +#include "access/commit_ts.h" #include "access/htup_details.h" #include "access/table.h" #include "access/twophase.h" @@ -71,8 +72,9 @@ 
#define SUBOPT_PASSWORD_REQUIRED 0x00000800 #define SUBOPT_RUN_AS_OWNER 0x00001000 #define SUBOPT_FAILOVER 0x00002000 -#define SUBOPT_LSN 0x00004000 -#define SUBOPT_ORIGIN 0x00008000 +#define SUBOPT_RETAIN_DEAD_TUPLES 0x00004000 +#define SUBOPT_LSN 0x00008000 +#define SUBOPT_ORIGIN 0x00010000 /* check if the 'val' has 'bits' set */ #define IsSet(val, bits) (((val) & (bits)) == (bits)) @@ -98,6 +100,7 @@ typedef struct SubOpts bool passwordrequired; bool runasowner; bool failover; + bool retaindeadtuples; char *origin; XLogRecPtr lsn; } SubOpts; @@ -105,8 +108,10 @@ typedef struct SubOpts static List *fetch_table_list(WalReceiverConn *wrconn, List *publications); static void check_publications_origin(WalReceiverConn *wrconn, List *publications, bool copydata, - char *origin, Oid *subrel_local_oids, - int subrel_count, char *subname); + bool retain_dead_tuples, char *origin, + Oid *subrel_local_oids, int subrel_count, + char *subname); +static void check_pub_dead_tuple_retention(WalReceiverConn *wrconn); static void check_duplicates_in_publist(List *publist, Datum *datums); static List *merge_publications(List *oldpublist, List *newpublist, bool addpub, const char *subname); static void ReportSlotConnectionError(List *rstates, Oid subid, char *slotname, char *err); @@ -162,6 +167,8 @@ parse_subscription_options(ParseState *pstate, List *stmt_options, opts->runasowner = false; if (IsSet(supported_opts, SUBOPT_FAILOVER)) opts->failover = false; + if (IsSet(supported_opts, SUBOPT_RETAIN_DEAD_TUPLES)) + opts->retaindeadtuples = false; if (IsSet(supported_opts, SUBOPT_ORIGIN)) opts->origin = pstrdup(LOGICALREP_ORIGIN_ANY); @@ -210,7 +217,7 @@ parse_subscription_options(ParseState *pstate, List *stmt_options, if (strcmp(opts->slot_name, "none") == 0) opts->slot_name = NULL; else - ReplicationSlotValidateName(opts->slot_name, ERROR); + ReplicationSlotValidateName(opts->slot_name, false, ERROR); } else if (IsSet(supported_opts, SUBOPT_COPY_DATA) && strcmp(defel->defname, "copy_data") == 0) @@ -307,6 +314,15 @@ parse_subscription_options(ParseState *pstate, List *stmt_options, opts->specified_opts |= SUBOPT_FAILOVER; opts->failover = defGetBoolean(defel); } + else if (IsSet(supported_opts, SUBOPT_RETAIN_DEAD_TUPLES) && + strcmp(defel->defname, "retain_dead_tuples") == 0) + { + if (IsSet(opts->specified_opts, SUBOPT_RETAIN_DEAD_TUPLES)) + errorConflictingDefElem(defel, pstate); + + opts->specified_opts |= SUBOPT_RETAIN_DEAD_TUPLES; + opts->retaindeadtuples = defGetBoolean(defel); + } else if (IsSet(supported_opts, SUBOPT_ORIGIN) && strcmp(defel->defname, "origin") == 0) { @@ -563,7 +579,8 @@ CreateSubscription(ParseState *pstate, CreateSubscriptionStmt *stmt, SUBOPT_SYNCHRONOUS_COMMIT | SUBOPT_BINARY | SUBOPT_STREAMING | SUBOPT_TWOPHASE_COMMIT | SUBOPT_DISABLE_ON_ERR | SUBOPT_PASSWORD_REQUIRED | - SUBOPT_RUN_AS_OWNER | SUBOPT_FAILOVER | SUBOPT_ORIGIN); + SUBOPT_RUN_AS_OWNER | SUBOPT_FAILOVER | + SUBOPT_RETAIN_DEAD_TUPLES | SUBOPT_ORIGIN); parse_subscription_options(pstate, stmt->options, supported_opts, &opts); /* @@ -630,6 +647,10 @@ CreateSubscription(ParseState *pstate, CreateSubscriptionStmt *stmt, stmt->subname))); } + /* Ensure that we can enable retain_dead_tuples */ + if (opts.retaindeadtuples) + CheckSubDeadTupleRetention(true, !opts.enabled, WARNING); + if (!IsSet(opts.specified_opts, SUBOPT_SLOT_NAME) && opts.slot_name == NULL) opts.slot_name = stmt->subname; @@ -670,6 +691,8 @@ CreateSubscription(ParseState *pstate, CreateSubscriptionStmt *stmt, 
values[Anum_pg_subscription_subpasswordrequired - 1] = BoolGetDatum(opts.passwordrequired); values[Anum_pg_subscription_subrunasowner - 1] = BoolGetDatum(opts.runasowner); values[Anum_pg_subscription_subfailover - 1] = BoolGetDatum(opts.failover); + values[Anum_pg_subscription_subretaindeadtuples - 1] = + BoolGetDatum(opts.retaindeadtuples); values[Anum_pg_subscription_subconninfo - 1] = CStringGetTextDatum(conninfo); if (opts.slot_name) @@ -722,7 +745,11 @@ CreateSubscription(ParseState *pstate, CreateSubscriptionStmt *stmt, { check_publications(wrconn, publications); check_publications_origin(wrconn, publications, opts.copy_data, - opts.origin, NULL, 0, stmt->subname); + opts.retaindeadtuples, opts.origin, + NULL, 0, stmt->subname); + + if (opts.retaindeadtuples) + check_pub_dead_tuple_retention(wrconn); /* * Set sync state based on if we were asked to do data copy or @@ -881,8 +908,8 @@ AlterSubscription_refresh(Subscription *sub, bool copy_data, sizeof(Oid), oid_cmp); check_publications_origin(wrconn, sub->publications, copy_data, - sub->origin, subrel_local_oids, - subrel_count, sub->name); + sub->retaindeadtuples, sub->origin, + subrel_local_oids, subrel_count, sub->name); /* * Rels that we want to remove from subscription and drop any slots @@ -1040,18 +1067,22 @@ AlterSubscription_refresh(Subscription *sub, bool copy_data, } /* - * Common checks for altering failover and two_phase options. + * Common checks for altering failover, two_phase, and retain_dead_tuples + * options. */ static void CheckAlterSubOption(Subscription *sub, const char *option, bool slot_needs_update, bool isTopLevel) { + Assert(strcmp(option, "failover") == 0 || + strcmp(option, "two_phase") == 0 || + strcmp(option, "retain_dead_tuples") == 0); + /* - * The checks in this function are required only for failover and - * two_phase options. + * Altering the retain_dead_tuples option does not update the slot on the + * publisher. */ - Assert(strcmp(option, "failover") == 0 || - strcmp(option, "two_phase") == 0); + Assert(!slot_needs_update || strcmp(option, "retain_dead_tuples") != 0); /* * Do not allow changing the option if the subscription is enabled. This @@ -1063,6 +1094,39 @@ CheckAlterSubOption(Subscription *sub, const char *option, * the publisher by the existing walsender, so we could have allowed that * even when the subscription is enabled. But we kept this restriction for * the sake of consistency and simplicity. + * + * Additionally, do not allow changing the retain_dead_tuples option when + * the subscription is enabled to prevent race conditions arising from the + * new option value being acknowledged asynchronously by the launcher and + * apply workers. + * + * Without the restriction, a race condition may arise when a user + * disables and immediately re-enables the retain_dead_tuples option. In + * this case, the launcher might drop the slot upon noticing the disabled + * action, while the apply worker may keep maintaining + * oldest_nonremovable_xid without noticing the option change. During this + * period, a transaction ID wraparound could falsely make this ID appear + * as if it originates from the future w.r.t the transaction ID stored in + * the slot maintained by launcher. + * + * Similarly, if the user enables retain_dead_tuples concurrently with the + * launcher starting the worker, the apply worker may start calculating + * oldest_nonremovable_xid before the launcher notices the enable action. 
+ * Consequently, the launcher may update slot.xmin to a newer value than + * that maintained by the worker. In subsequent cycles, upon integrating + * the worker's oldest_nonremovable_xid, the launcher might detect a + * retreat in the calculated xmin, necessitating additional handling. + * + * XXX To address the above race conditions, we can define + * oldest_nonremovable_xid as FullTransactionID and adds the check to + * disallow retreating the conflict slot's xmin. For now, we kept the + * implementation simple by disallowing change to the retain_dead_tuples, + * but in the future we can change this after some more analysis. + * + * Note that we could restrict only the enabling of retain_dead_tuples to + * avoid the race conditions described above, but we maintain the + * restriction for both enable and disable operations for the sake of + * consistency. */ if (sub->enabled) ereport(ERROR, @@ -1110,6 +1174,9 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, bool update_tuple = false; bool update_failover = false; bool update_two_phase = false; + bool check_pub_rdt = false; + bool retain_dead_tuples; + char *origin; Subscription *sub; Form_pg_subscription form; bits32 supported_opts; @@ -1137,6 +1204,9 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, sub = GetSubscription(subid, false); + retain_dead_tuples = sub->retaindeadtuples; + origin = sub->origin; + /* * Don't allow non-superuser modification of a subscription with * password_required=false. @@ -1165,7 +1235,7 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, SUBOPT_DISABLE_ON_ERR | SUBOPT_PASSWORD_REQUIRED | SUBOPT_RUN_AS_OWNER | SUBOPT_FAILOVER | - SUBOPT_ORIGIN); + SUBOPT_RETAIN_DEAD_TUPLES | SUBOPT_ORIGIN); parse_subscription_options(pstate, stmt->options, supported_opts, &opts); @@ -1325,11 +1395,62 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, replaces[Anum_pg_subscription_subfailover - 1] = true; } + if (IsSet(opts.specified_opts, SUBOPT_RETAIN_DEAD_TUPLES)) + { + values[Anum_pg_subscription_subretaindeadtuples - 1] = + BoolGetDatum(opts.retaindeadtuples); + replaces[Anum_pg_subscription_subretaindeadtuples - 1] = true; + + CheckAlterSubOption(sub, "retain_dead_tuples", false, isTopLevel); + + /* + * Workers may continue running even after the + * subscription has been disabled. + * + * To prevent race conditions (as described in + * CheckAlterSubOption()), ensure that all worker + * processes have already exited before proceeding. + */ + if (logicalrep_workers_find(subid, true, true)) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("cannot alter retain_dead_tuples when logical replication worker is still running"), + errhint("Try again after some time."))); + + /* + * Remind the user that enabling subscription will prevent + * the accumulation of dead tuples. + */ + if (opts.retaindeadtuples) + CheckSubDeadTupleRetention(true, !sub->enabled, NOTICE); + + /* + * Notify the launcher to manage the replication slot for + * conflict detection. This ensures that replication slot + * is efficiently handled (created, updated, or dropped) + * in response to any configuration changes. 
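The XXX note above observes that tracking oldest_nonremovable_xid as a FullTransactionId would let a retreat of the conflict slot's xmin be rejected without being confused by transaction-ID wraparound. A small standalone illustration of that idea follows; the types and values are hypothetical stand-ins for this sketch, not the launcher or apply-worker code.

#include <stdint.h>
#include <stdio.h>

/*
 * A 64-bit "full" transaction ID (epoch in the high bits, xid in the low
 * bits) does not wrap in practice, so a plain integer comparison is enough
 * to detect a value moving backwards.
 */
typedef struct { uint64_t value; } SketchFullXid;

static int
sketch_full_xid_precedes(SketchFullXid a, SketchFullXid b)
{
    return a.value < b.value;
}

int
main(void)
{
    SketchFullXid slot_xmin  = { ((uint64_t) 5 << 32) | 100 }; /* epoch 5, xid 100 */
    SketchFullXid candidate  = { ((uint64_t) 5 << 32) |  90 }; /* would retreat */

    if (sketch_full_xid_precedes(candidate, slot_xmin))
        printf("reject: candidate xmin retreats behind the slot's xmin\n");

    /*
     * With bare 32-bit XIDs, comparing 90 against 100 after a wraparound
     * could legitimately mean "90 is newer", so the same check would be
     * ambiguous; the 64-bit form removes that ambiguity.
     */
    return 0;
}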
+ */ + ApplyLauncherWakeupAtCommit(); + + check_pub_rdt = opts.retaindeadtuples; + retain_dead_tuples = opts.retaindeadtuples; + } + if (IsSet(opts.specified_opts, SUBOPT_ORIGIN)) { values[Anum_pg_subscription_suborigin - 1] = CStringGetTextDatum(opts.origin); replaces[Anum_pg_subscription_suborigin - 1] = true; + + /* + * Check if changes from different origins may be received + * from the publisher when the origin is changed to ANY + * and retain_dead_tuples is enabled. + */ + check_pub_rdt = retain_dead_tuples && + pg_strcasecmp(opts.origin, LOGICALREP_ORIGIN_ANY) == 0; + + origin = opts.origin; } update_tuple = true; @@ -1347,6 +1468,15 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("cannot enable subscription that does not have a slot name"))); + /* + * Check track_commit_timestamp only when enabling the + * subscription in case it was disabled after creation. See + * comments atop CheckSubDeadTupleRetention() for details. + */ + if (sub->retaindeadtuples) + CheckSubDeadTupleRetention(opts.enabled, !opts.enabled, + WARNING); + values[Anum_pg_subscription_subenabled - 1] = BoolGetDatum(opts.enabled); replaces[Anum_pg_subscription_subenabled - 1] = true; @@ -1355,6 +1485,14 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, ApplyLauncherWakeupAtCommit(); update_tuple = true; + + /* + * The subscription might be initially created with + * connect=false and retain_dead_tuples=true, meaning the + * remote server's status may not be checked. Ensure this + * check is conducted now. + */ + check_pub_rdt = sub->retaindeadtuples && opts.enabled; break; } @@ -1369,6 +1507,13 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, CStringGetTextDatum(stmt->conninfo); replaces[Anum_pg_subscription_subconninfo - 1] = true; update_tuple = true; + + /* + * Since the remote server configuration might have changed, + * perform a check to ensure it permits enabling + * retain_dead_tuples. + */ + check_pub_rdt = sub->retaindeadtuples; break; case ALTER_SUBSCRIPTION_SET_PUBLICATION: @@ -1568,14 +1713,15 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, } /* - * Try to acquire the connection necessary for altering the slot, if - * needed. + * Try to acquire the connection necessary either for modifying the slot + * or for checking if the remote server permits enabling + * retain_dead_tuples. * * This has to be at the end because otherwise if there is an error while * doing the database operations we won't be able to rollback altered * slot. */ - if (update_failover || update_two_phase) + if (update_failover || update_two_phase || check_pub_rdt) { bool must_use_password; char *err; @@ -1584,10 +1730,14 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, /* Load the library providing us libpq calls. */ load_file("libpqwalreceiver", false); - /* Try to connect to the publisher. */ + /* + * Try to connect to the publisher, using the new connection string if + * available. + */ must_use_password = sub->passwordrequired && !sub->ownersuperuser; - wrconn = walrcv_connect(sub->conninfo, true, true, must_use_password, - sub->name, &err); + wrconn = walrcv_connect(stmt->conninfo ? 
stmt->conninfo : sub->conninfo, + true, true, must_use_password, sub->name, + &err); if (!wrconn) ereport(ERROR, (errcode(ERRCODE_CONNECTION_FAILURE), @@ -1596,9 +1746,17 @@ AlterSubscription(ParseState *pstate, AlterSubscriptionStmt *stmt, PG_TRY(); { - walrcv_alter_slot(wrconn, sub->slotname, - update_failover ? &opts.failover : NULL, - update_two_phase ? &opts.twophase : NULL); + if (retain_dead_tuples) + check_pub_dead_tuple_retention(wrconn); + + check_publications_origin(wrconn, sub->publications, false, + retain_dead_tuples, origin, NULL, 0, + sub->name); + + if (update_failover || update_two_phase) + walrcv_alter_slot(wrconn, sub->slotname, + update_failover ? &opts.failover : NULL, + update_two_phase ? &opts.twophase : NULL); } PG_FINALLY(); { @@ -2086,20 +2244,29 @@ AlterSubscriptionOwner_oid(Oid subid, Oid newOwnerId) * Check and log a warning if the publisher has subscribed to the same table, * its partition ancestors (if it's a partition), or its partition children (if * it's a partitioned table), from some other publishers. This check is - * required only if "copy_data = true" and "origin = none" for CREATE - * SUBSCRIPTION and ALTER SUBSCRIPTION ... REFRESH statements to notify the - * user that data having origin might have been copied. + * required in the following scenarios: * - * This check need not be performed on the tables that are already added - * because incremental sync for those tables will happen through WAL and the - * origin of the data can be identified from the WAL records. + * 1) For CREATE SUBSCRIPTION and ALTER SUBSCRIPTION ... REFRESH statements + * with "copy_data = true" and "origin = none": + * - Warn the user that data with an origin might have been copied. + * - This check is skipped for tables already added, as incremental sync via + * WAL allows origin tracking. The list of such tables is in + * subrel_local_oids. * - * subrel_local_oids contains the list of relation oids that are already - * present on the subscriber. + * 2) For CREATE SUBSCRIPTION and ALTER SUBSCRIPTION ... REFRESH statements + * with "retain_dead_tuples = true" and "origin = any", and for ALTER + * SUBSCRIPTION statements that modify retain_dead_tuples or origin, or + * when the publisher's status changes (e.g., due to a connection string + * update): + * - Warn the user that only conflict detection info for local changes on + * the publisher is retained. Data from other origins may lack sufficient + * details for reliable conflict detection. + * - See comments atop worker.c for more details. */ static void check_publications_origin(WalReceiverConn *wrconn, List *publications, - bool copydata, char *origin, Oid *subrel_local_oids, + bool copydata, bool retain_dead_tuples, + char *origin, Oid *subrel_local_oids, int subrel_count, char *subname) { WalRcvExecResult *res; @@ -2108,9 +2275,29 @@ check_publications_origin(WalReceiverConn *wrconn, List *publications, Oid tableRow[1] = {TEXTOID}; List *publist = NIL; int i; + bool check_rdt; + bool check_table_sync; + bool origin_none = origin && + pg_strcasecmp(origin, LOGICALREP_ORIGIN_NONE) == 0; + + /* + * Enable retain_dead_tuples checks only when origin is set to 'any', + * since with origin='none' only local changes are replicated to the + * subscriber. + */ + check_rdt = retain_dead_tuples && !origin_none; + + /* + * Enable table synchronization checks only when origin is 'none', to + * ensure that data from other origins is not inadvertently copied. 
+ */ + check_table_sync = copydata && origin_none; - if (!copydata || !origin || - (pg_strcasecmp(origin, LOGICALREP_ORIGIN_NONE) != 0)) + /* retain_dead_tuples and table sync checks occur separately */ + Assert(!(check_rdt && check_table_sync)); + + /* Return if no checks are required */ + if (!check_rdt && !check_table_sync) return; initStringInfo(&cmd); @@ -2129,16 +2316,23 @@ check_publications_origin(WalReceiverConn *wrconn, List *publications, /* * In case of ALTER SUBSCRIPTION ... REFRESH, subrel_local_oids contains * the list of relation oids that are already present on the subscriber. - * This check should be skipped for these tables. + * This check should be skipped for these tables if checking for table + * sync scenario. However, when handling the retain_dead_tuples scenario, + * ensure all tables are checked, as some existing tables may now include + * changes from other origins due to newly created subscriptions on the + * publisher. */ - for (i = 0; i < subrel_count; i++) + if (check_table_sync) { - Oid relid = subrel_local_oids[i]; - char *schemaname = get_namespace_name(get_rel_namespace(relid)); - char *tablename = get_rel_name(relid); + for (i = 0; i < subrel_count; i++) + { + Oid relid = subrel_local_oids[i]; + char *schemaname = get_namespace_name(get_rel_namespace(relid)); + char *tablename = get_rel_name(relid); - appendStringInfo(&cmd, "AND NOT (N.nspname = '%s' AND C.relname = '%s')\n", - schemaname, tablename); + appendStringInfo(&cmd, "AND NOT (N.nspname = '%s' AND C.relname = '%s')\n", + schemaname, tablename); + } } res = walrcv_exec(wrconn, cmd.data, 1, tableRow); @@ -2173,22 +2367,37 @@ check_publications_origin(WalReceiverConn *wrconn, List *publications, * XXX: For simplicity, we don't check whether the table has any data or * not. If the table doesn't have any data then we don't need to * distinguish between data having origin and data not having origin so we - * can avoid logging a warning in that case. + * can avoid logging a warning for table sync scenario. */ if (publist) { StringInfo pubnames = makeStringInfo(); + StringInfo err_msg = makeStringInfo(); + StringInfo err_hint = makeStringInfo(); /* Prepare the list of publication(s) for warning message. 
*/ GetPublicationsStr(publist, pubnames, false); + + if (check_table_sync) + { + appendStringInfo(err_msg, _("subscription \"%s\" requested copy_data with origin = NONE but might copy data that had a different origin"), + subname); + appendStringInfoString(err_hint, _("Verify that initial data copied from the publisher tables did not come from other origins.")); + } + else + { + appendStringInfo(err_msg, _("subscription \"%s\" enabled retain_dead_tuples but might not reliably detect conflicts for changes from different origins"), + subname); + appendStringInfoString(err_hint, _("Consider using origin = NONE or disabling retain_dead_tuples.")); + } + ereport(WARNING, errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("subscription \"%s\" requested copy_data with origin = NONE but might copy data that had a different origin", - subname), - errdetail_plural("The subscription being created subscribes to a publication (%s) that contains tables that are written to by other subscriptions.", - "The subscription being created subscribes to publications (%s) that contain tables that are written to by other subscriptions.", + errmsg_internal("%s", err_msg->data), + errdetail_plural("The subscription subscribes to a publication (%s) that contains tables that are written to by other subscriptions.", + "The subscription subscribes to publications (%s) that contain tables that are written to by other subscriptions.", list_length(publist), pubnames->data), - errhint("Verify that initial data copied from the publisher tables did not come from other origins.")); + errhint_internal("%s", err_hint->data)); } ExecDropSingleTupleTableSlot(slot); @@ -2197,6 +2406,101 @@ check_publications_origin(WalReceiverConn *wrconn, List *publications, } /* + * Determine whether the retain_dead_tuples can be enabled based on the + * publisher's status. + * + * This option is disallowed if the publisher is running a version earlier + * than the PG19, or if the publisher is in recovery (i.e., it is a standby + * server). + * + * See comments atop worker.c for a detailed explanation. + */ +static void +check_pub_dead_tuple_retention(WalReceiverConn *wrconn) +{ + WalRcvExecResult *res; + Oid RecoveryRow[1] = {BOOLOID}; + TupleTableSlot *slot; + bool isnull; + bool remote_in_recovery; + + if (walrcv_server_version(wrconn) < 19000) + ereport(ERROR, + errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("cannot enable retain_dead_tuples if the publisher is running a version earlier than PostgreSQL 19")); + + res = walrcv_exec(wrconn, "SELECT pg_is_in_recovery()", 1, RecoveryRow); + + if (res->status != WALRCV_OK_TUPLES) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("could not obtain recovery progress from the publisher: %s", + res->err))); + + slot = MakeSingleTupleTableSlot(res->tupledesc, &TTSOpsMinimalTuple); + if (!tuplestore_gettupleslot(res->tuplestore, true, false, slot)) + elog(ERROR, "failed to fetch tuple for the recovery progress"); + + remote_in_recovery = DatumGetBool(slot_getattr(slot, 1, &isnull)); + + if (remote_in_recovery) + ereport(ERROR, + errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot enable retain_dead_tuples if the publisher is in recovery.")); + + ExecDropSingleTupleTableSlot(slot); + + walrcv_clear_result(res); +} + +/* + * Check if the subscriber's configuration is adequate to enable the + * retain_dead_tuples option. + * + * Issue an ERROR if the wal_level does not support the use of replication + * slots when check_guc is set to true. 
+ * + * Issue a WARNING if track_commit_timestamp is not enabled when check_guc is + * set to true. This is only to highlight the importance of enabling + * track_commit_timestamp instead of catching all the misconfigurations, as + * this setting can be adjusted after subscription creation. Without it, the + * apply worker will simply skip conflict detection. + * + * Issue a WARNING or NOTICE if the subscription is disabled. Do not raise an + * ERROR since users can only modify retain_dead_tuples for disabled + * subscriptions. And as long as the subscription is enabled promptly, it will + * not pose issues. + */ +void +CheckSubDeadTupleRetention(bool check_guc, bool sub_disabled, + int elevel_for_sub_disabled) +{ + Assert(elevel_for_sub_disabled == NOTICE || + elevel_for_sub_disabled == WARNING); + + if (check_guc && wal_level < WAL_LEVEL_REPLICA) + ereport(ERROR, + errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("\"wal_level\" is insufficient to create the replication slot required by retain_dead_tuples"), + errhint("\"wal_level\" must be set to \"replica\" or \"logical\" at server start.")); + + if (check_guc && !track_commit_timestamp) + ereport(WARNING, + errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("commit timestamp and origin data required for detecting conflicts won't be retained"), + errhint("Consider setting \"%s\" to true.", + "track_commit_timestamp")); + + if (sub_disabled) + ereport(elevel_for_sub_disabled, + errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("deleted rows to detect conflicts would not be removed until the subscription is enabled"), + (elevel_for_sub_disabled > NOTICE) + ? errhint("Consider setting %s to false.", + "retain_dead_tuples") : 0); +} + +/* * Get the list of tables which belong to specified publications on the * publisher connection. * diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c index 67f8e70f9c1..7dc121f73f1 100644 --- a/src/backend/commands/trigger.c +++ b/src/backend/commands/trigger.c @@ -80,6 +80,7 @@ static bool GetTupleForTrigger(EState *estate, ItemPointer tid, LockTupleMode lockmode, TupleTableSlot *oldslot, + bool do_epq_recheck, TupleTableSlot **epqslot, TM_Result *tmresultp, TM_FailureData *tmfdp); @@ -2693,7 +2694,8 @@ ExecBRDeleteTriggers(EState *estate, EPQState *epqstate, HeapTuple fdw_trigtuple, TupleTableSlot **epqslot, TM_Result *tmresult, - TM_FailureData *tmfd) + TM_FailureData *tmfd, + bool is_merge_delete) { TupleTableSlot *slot = ExecGetTriggerOldSlot(estate, relinfo); TriggerDesc *trigdesc = relinfo->ri_TrigDesc; @@ -2708,9 +2710,17 @@ ExecBRDeleteTriggers(EState *estate, EPQState *epqstate, { TupleTableSlot *epqslot_candidate = NULL; + /* + * Get a copy of the on-disk tuple we are planning to delete. In + * general, if the tuple has been concurrently updated, we should + * recheck it using EPQ. However, if this is a MERGE DELETE action, + * we skip this EPQ recheck and leave it to the caller (it must do + * additional rechecking, and might end up executing a different + * action entirely). 
+ */ if (!GetTupleForTrigger(estate, epqstate, relinfo, tupleid, - LockTupleExclusive, slot, &epqslot_candidate, - tmresult, tmfd)) + LockTupleExclusive, slot, !is_merge_delete, + &epqslot_candidate, tmresult, tmfd)) return false; /* @@ -2800,6 +2810,7 @@ ExecARDeleteTriggers(EState *estate, tupleid, LockTupleExclusive, slot, + false, NULL, NULL, NULL); @@ -2944,7 +2955,8 @@ ExecBRUpdateTriggers(EState *estate, EPQState *epqstate, HeapTuple fdw_trigtuple, TupleTableSlot *newslot, TM_Result *tmresult, - TM_FailureData *tmfd) + TM_FailureData *tmfd, + bool is_merge_update) { TriggerDesc *trigdesc = relinfo->ri_TrigDesc; TupleTableSlot *oldslot = ExecGetTriggerOldSlot(estate, relinfo); @@ -2965,10 +2977,17 @@ ExecBRUpdateTriggers(EState *estate, EPQState *epqstate, { TupleTableSlot *epqslot_candidate = NULL; - /* get a copy of the on-disk tuple we are planning to update */ + /* + * Get a copy of the on-disk tuple we are planning to update. In + * general, if the tuple has been concurrently updated, we should + * recheck it using EPQ. However, if this is a MERGE UPDATE action, + * we skip this EPQ recheck and leave it to the caller (it must do + * additional rechecking, and might end up executing a different + * action entirely). + */ if (!GetTupleForTrigger(estate, epqstate, relinfo, tupleid, - lockmode, oldslot, &epqslot_candidate, - tmresult, tmfd)) + lockmode, oldslot, !is_merge_update, + &epqslot_candidate, tmresult, tmfd)) return false; /* cancel the update action */ /* @@ -3142,6 +3161,7 @@ ExecARUpdateTriggers(EState *estate, ResultRelInfo *relinfo, tupleid, LockTupleExclusive, oldslot, + false, NULL, NULL, NULL); @@ -3298,6 +3318,7 @@ GetTupleForTrigger(EState *estate, ItemPointer tid, LockTupleMode lockmode, TupleTableSlot *oldslot, + bool do_epq_recheck, TupleTableSlot **epqslot, TM_Result *tmresultp, TM_FailureData *tmfdp) @@ -3357,29 +3378,30 @@ GetTupleForTrigger(EState *estate, if (tmfd.traversed) { /* - * Recheck the tuple using EPQ. For MERGE, we leave this - * to the caller (it must do additional rechecking, and - * might end up executing a different action entirely). + * Recheck the tuple using EPQ, if requested. Otherwise, + * just return that it was concurrently updated. */ - if (estate->es_plannedstmt->commandType == CMD_MERGE) + if (do_epq_recheck) { - if (tmresultp) - *tmresultp = TM_Updated; - return false; + *epqslot = EvalPlanQual(epqstate, + relation, + relinfo->ri_RangeTableIndex, + oldslot); + + /* + * If PlanQual failed for updated tuple - we must not + * process this tuple! + */ + if (TupIsNull(*epqslot)) + { + *epqslot = NULL; + return false; + } } - - *epqslot = EvalPlanQual(epqstate, - relation, - relinfo->ri_RangeTableIndex, - oldslot); - - /* - * If PlanQual failed for updated tuple - we must not - * process this tuple! - */ - if (TupIsNull(*epqslot)) + else { - *epqslot = NULL; + if (tmresultp) + *tmresultp = TM_Updated; return false; } } diff --git a/src/backend/executor/execExprInterp.c b/src/backend/executor/execExprInterp.c index 8a72b5e70a4..1a37737d4a2 100644 --- a/src/backend/executor/execExprInterp.c +++ b/src/backend/executor/execExprInterp.c @@ -5228,7 +5228,6 @@ ExecEvalJsonCoercionFinish(ExprState *state, ExprEvalStep *op) * JsonBehavior expression. 
*/ jsestate->escontext.error_occurred = false; - jsestate->escontext.error_occurred = false; jsestate->escontext.details_wanted = true; } } diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c index f3e77bda279..f098a5557cf 100644 --- a/src/backend/executor/execParallel.c +++ b/src/backend/executor/execParallel.c @@ -189,6 +189,7 @@ ExecSerializePlan(Plan *plan, EState *estate) pstmt->permInfos = estate->es_rteperminfos; pstmt->resultRelations = NIL; pstmt->appendRelations = NIL; + pstmt->planOrigin = PLAN_STMT_INTERNAL; /* * Transfer only parallel-safe subplans, leaving a NULL "hole" in the list diff --git a/src/backend/executor/execReplication.c b/src/backend/executor/execReplication.c index 53ddd25c42d..f262e7a66f7 100644 --- a/src/backend/executor/execReplication.c +++ b/src/backend/executor/execReplication.c @@ -670,7 +670,7 @@ ExecSimpleRelationUpdate(ResultRelInfo *resultRelInfo, resultRelInfo->ri_TrigDesc->trig_update_before_row) { if (!ExecBRUpdateTriggers(estate, epqstate, resultRelInfo, - tid, NULL, slot, NULL, NULL)) + tid, NULL, slot, NULL, NULL, false)) skip_tuple = true; /* "do nothing" */ } @@ -746,7 +746,7 @@ ExecSimpleRelationDelete(ResultRelInfo *resultRelInfo, resultRelInfo->ri_TrigDesc->trig_delete_before_row) { skip_tuple = !ExecBRDeleteTriggers(estate, epqstate, resultRelInfo, - tid, NULL, NULL, NULL, NULL); + tid, NULL, NULL, NULL, NULL, false); } if (!skip_tuple) diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index 54da8e7995b..7c6c2c1f6e4 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -1474,7 +1474,8 @@ ExecDeletePrologue(ModifyTableContext *context, ResultRelInfo *resultRelInfo, return ExecBRDeleteTriggers(context->estate, context->epqstate, resultRelInfo, tupleid, oldtuple, - epqreturnslot, result, &context->tmfd); + epqreturnslot, result, &context->tmfd, + context->mtstate->operation == CMD_MERGE); } return true; @@ -2117,7 +2118,8 @@ ExecUpdatePrologue(ModifyTableContext *context, ResultRelInfo *resultRelInfo, return ExecBRUpdateTriggers(context->estate, context->epqstate, resultRelInfo, tupleid, oldtuple, slot, - result, &context->tmfd); + result, &context->tmfd, + context->mtstate->operation == CMD_MERGE); } return true; diff --git a/src/backend/jit/llvm/Makefile b/src/backend/jit/llvm/Makefile index e8c12060b93..68677ba42e1 100644 --- a/src/backend/jit/llvm/Makefile +++ b/src/backend/jit/llvm/Makefile @@ -31,7 +31,7 @@ endif # All files in this directory use LLVM. 
CFLAGS += $(LLVM_CFLAGS) CXXFLAGS += $(LLVM_CXXFLAGS) -override CPPFLAGS := $(LLVM_CPPFLAGS) $(CPPFLAGS) +override CPPFLAGS += $(LLVM_CPPFLAGS) SHLIB_LINK += $(LLVM_LIBS) # Because this module includes C++ files, we need to use a C++ diff --git a/src/backend/libpq/auth.c b/src/backend/libpq/auth.c index 9f4d05ffbd4..4da46666439 100644 --- a/src/backend/libpq/auth.c +++ b/src/backend/libpq/auth.c @@ -94,8 +94,16 @@ static int auth_peer(hbaPort *port); #define PGSQL_PAM_SERVICE "postgresql" /* Service name passed to PAM */ +/* Work around original Solaris' lack of "const" in the conv_proc signature */ +#ifdef _PAM_LEGACY_NONCONST +#define PG_PAM_CONST +#else +#define PG_PAM_CONST const +#endif + static int CheckPAMAuth(Port *port, const char *user, const char *password); -static int pam_passwd_conv_proc(int num_msg, const struct pam_message **msg, +static int pam_passwd_conv_proc(int num_msg, + PG_PAM_CONST struct pam_message **msg, struct pam_response **resp, void *appdata_ptr); static struct pam_conv pam_passw_conv = { @@ -1917,7 +1925,7 @@ auth_peer(hbaPort *port) */ static int -pam_passwd_conv_proc(int num_msg, const struct pam_message **msg, +pam_passwd_conv_proc(int num_msg, PG_PAM_CONST struct pam_message **msg, struct pam_response **resp, void *appdata_ptr) { const char *passwd; diff --git a/src/backend/libpq/pg_ident.conf.sample b/src/backend/libpq/pg_ident.conf.sample index f5225f26cdf..8ee6c0ba315 100644 --- a/src/backend/libpq/pg_ident.conf.sample +++ b/src/backend/libpq/pg_ident.conf.sample @@ -13,25 +13,25 @@ # user names to their corresponding PostgreSQL user names. Records # are of the form: # -# MAPNAME SYSTEM-USERNAME PG-USERNAME +# MAPNAME SYSTEM-USERNAME DATABASE-USERNAME # # (The uppercase quantities must be replaced by actual values.) # # MAPNAME is the (otherwise freely chosen) map name that was used in # pg_hba.conf. SYSTEM-USERNAME is the detected user name of the -# client. PG-USERNAME is the requested PostgreSQL user name. The -# existence of a record specifies that SYSTEM-USERNAME may connect as -# PG-USERNAME. +# client. DATABASE-USERNAME is the requested PostgreSQL user name. +# The existence of a record specifies that SYSTEM-USERNAME may connect +# as DATABASE-USERNAME. # -# If SYSTEM-USERNAME starts with a slash (/), it will be treated as a -# regular expression. Optionally this can contain a capture (a -# parenthesized subexpression). The substring matching the capture -# will be substituted for \1 (backslash-one) if present in -# PG-USERNAME. +# If SYSTEM-USERNAME starts with a slash (/), the rest of it will be +# treated as a regular expression. Optionally this can contain a capture +# (a parenthesized subexpression). The substring matching the capture +# will be substituted for \1 (backslash-one) if that appears in +# DATABASE-USERNAME. # -# PG-USERNAME can be "all", a user name, a group name prefixed with "+", or -# a regular expression (if it starts with a slash (/)). If it is a regular -# expression, the substring matching with \1 has no effect. +# DATABASE-USERNAME can be "all", a user name, a group name prefixed with "+", +# or a regular expression (if it starts with a slash (/)). If it is a regular +# expression, no substitution for \1 will occur. # # Multiple maps may be specified in this file and used by pg_hba.conf. 
# @@ -69,4 +69,4 @@ # Put your actual configuration here # ---------------------------------- -# MAPNAME SYSTEM-USERNAME PG-USERNAME +# MAPNAME SYSTEM-USERNAME DATABASE-USERNAME diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq/pqcomm.c index e5171467de1..25f739a6a17 100644 --- a/src/backend/libpq/pqcomm.c +++ b/src/backend/libpq/pqcomm.c @@ -858,7 +858,6 @@ RemoveSocketFiles(void) (void) unlink(sock_path); } /* Since we're about to exit, no need to reclaim storage */ - sock_paths = NIL; } diff --git a/src/backend/libpq/pqmq.c b/src/backend/libpq/pqmq.c index f1a08bc32ca..5f39949a367 100644 --- a/src/backend/libpq/pqmq.c +++ b/src/backend/libpq/pqmq.c @@ -23,7 +23,7 @@ #include "tcop/tcopprot.h" #include "utils/builtins.h" -static shm_mq_handle *pq_mq_handle; +static shm_mq_handle *pq_mq_handle = NULL; static bool pq_mq_busy = false; static pid_t pq_mq_parallel_leader_pid = 0; static ProcNumber pq_mq_parallel_leader_proc_number = INVALID_PROC_NUMBER; @@ -66,7 +66,11 @@ pq_redirect_to_shm_mq(dsm_segment *seg, shm_mq_handle *mqh) static void pq_cleanup_redirect_to_shm_mq(dsm_segment *seg, Datum arg) { - pq_mq_handle = NULL; + if (pq_mq_handle != NULL) + { + pfree(pq_mq_handle); + pq_mq_handle = NULL; + } whereToSendOutput = DestNone; } @@ -131,8 +135,11 @@ mq_putmessage(char msgtype, const char *s, size_t len) if (pq_mq_busy) { if (pq_mq_handle != NULL) + { shm_mq_detach(pq_mq_handle); - pq_mq_handle = NULL; + pfree(pq_mq_handle); + pq_mq_handle = NULL; + } return EOF; } @@ -152,8 +159,6 @@ mq_putmessage(char msgtype, const char *s, size_t len) iov[1].data = s; iov[1].len = len; - Assert(pq_mq_handle != NULL); - for (;;) { /* @@ -161,6 +166,7 @@ mq_putmessage(char msgtype, const char *s, size_t len) * that the shared memory value is updated before we send the parallel * message signal right after this. */ + Assert(pq_mq_handle != NULL); result = shm_mq_sendv(pq_mq_handle, iov, 2, true, true); if (pq_mq_parallel_leader_pid != 0) diff --git a/src/backend/main/main.c b/src/backend/main/main.c index 7d63cf94a6b..bdcb5e4f261 100644 --- a/src/backend/main/main.c +++ b/src/backend/main/main.c @@ -125,13 +125,17 @@ main(int argc, char *argv[]) set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("postgres")); /* - * In the postmaster, absorb the environment values for LC_COLLATE and - * LC_CTYPE. Individual backends will change these later to settings - * taken from pg_database, but the postmaster cannot do that. If we leave - * these set to "C" then message localization might not work well in the - * postmaster. + * Collation is handled by pg_locale.c, and the behavior is dependent on + * the provider. strcoll(), etc., should not be called directly. + */ + init_locale("LC_COLLATE", LC_COLLATE, "C"); + + /* + * In the postmaster, absorb the environment value for LC_CTYPE. + * Individual backends will change it later to pg_database.datctype, but + * the postmaster cannot do that. If we leave it set to "C" then message + * localization might not work well in the postmaster. 
*/ - init_locale("LC_COLLATE", LC_COLLATE, ""); init_locale("LC_CTYPE", LC_CTYPE, ""); /* diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 1f04a2c182c..344a3188317 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -2572,13 +2572,13 @@ cost_memoize_rescan(PlannerInfo *root, MemoizePath *mpath, Cost input_startup_cost = mpath->subpath->startup_cost; Cost input_total_cost = mpath->subpath->total_cost; double tuples = mpath->subpath->rows; - double calls = mpath->calls; + Cardinality est_calls = mpath->est_calls; int width = mpath->subpath->pathtarget->width; double hash_mem_bytes; double est_entry_bytes; - double est_cache_entries; - double ndistinct; + Cardinality est_cache_entries; + Cardinality ndistinct; double evict_ratio; double hit_ratio; Cost startup_cost; @@ -2604,7 +2604,7 @@ cost_memoize_rescan(PlannerInfo *root, MemoizePath *mpath, est_cache_entries = floor(hash_mem_bytes / est_entry_bytes); /* estimate on the distinct number of parameter values */ - ndistinct = estimate_num_groups(root, mpath->param_exprs, calls, NULL, + ndistinct = estimate_num_groups(root, mpath->param_exprs, est_calls, NULL, &estinfo); /* @@ -2616,7 +2616,10 @@ cost_memoize_rescan(PlannerInfo *root, MemoizePath *mpath, * certainly mean a MemoizePath will never survive add_path(). */ if ((estinfo.flags & SELFLAG_USED_DEFAULT) != 0) - ndistinct = calls; + ndistinct = est_calls; + + /* Remember the ndistinct estimate for EXPLAIN */ + mpath->est_unique_keys = ndistinct; /* * Since we've already estimated the maximum number of entries we can @@ -2644,9 +2647,12 @@ cost_memoize_rescan(PlannerInfo *root, MemoizePath *mpath, * must look at how many scans are estimated in total for this node and * how many of those scans we expect to get a cache hit. 
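 * Illustrative worked example (numbers invented): with est_calls = 1000,
 * ndistinct = 100 and est_cache_entries = 50, the expression below yields
 * ((1000 - 100) / 1000) * (50 / Max(100, 50)) = 0.9 * 0.5 = 0.45, i.e. we
 * expect roughly 45% of the rescans to be served from the cache.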
*/ - hit_ratio = ((calls - ndistinct) / calls) * + hit_ratio = ((est_calls - ndistinct) / est_calls) * (est_cache_entries / Max(ndistinct, est_cache_entries)); + /* Remember the hit ratio estimate for EXPLAIN */ + mpath->est_hit_ratio = hit_ratio; + Assert(hit_ratio >= 0 && hit_ratio <= 1.0); /* diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 8a9f1d7a943..bfefc7dbea1 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -284,7 +284,10 @@ static Material *make_material(Plan *lefttree); static Memoize *make_memoize(Plan *lefttree, Oid *hashoperators, Oid *collations, List *param_exprs, bool singlerow, bool binary_mode, - uint32 est_entries, Bitmapset *keyparamids); + uint32 est_entries, Bitmapset *keyparamids, + Cardinality est_calls, + Cardinality est_unique_keys, + double est_hit_ratio); static WindowAgg *make_windowagg(List *tlist, WindowClause *wc, int partNumCols, AttrNumber *partColIdx, Oid *partOperators, Oid *partCollations, int ordNumCols, AttrNumber *ordColIdx, Oid *ordOperators, Oid *ordCollations, @@ -1753,7 +1756,8 @@ create_memoize_plan(PlannerInfo *root, MemoizePath *best_path, int flags) plan = make_memoize(subplan, operators, collations, param_exprs, best_path->singlerow, best_path->binary_mode, - best_path->est_entries, keyparamids); + best_path->est_entries, keyparamids, best_path->est_calls, + best_path->est_unique_keys, best_path->est_hit_ratio); copy_generic_path_info(&plan->plan, (Path *) best_path); @@ -6749,7 +6753,9 @@ materialize_finished_plan(Plan *subplan) static Memoize * make_memoize(Plan *lefttree, Oid *hashoperators, Oid *collations, List *param_exprs, bool singlerow, bool binary_mode, - uint32 est_entries, Bitmapset *keyparamids) + uint32 est_entries, Bitmapset *keyparamids, + Cardinality est_calls, Cardinality est_unique_keys, + double est_hit_ratio) { Memoize *node = makeNode(Memoize); Plan *plan = &node->plan; @@ -6767,6 +6773,9 @@ make_memoize(Plan *lefttree, Oid *hashoperators, Oid *collations, node->binary_mode = binary_mode; node->est_entries = est_entries; node->keyparamids = keyparamids; + node->est_calls = est_calls; + node->est_unique_keys = est_unique_keys; + node->est_hit_ratio = est_hit_ratio; return node; } diff --git a/src/backend/optimizer/plan/initsplan.c b/src/backend/optimizer/plan/initsplan.c index 01804b085b3..3e3fec89252 100644 --- a/src/backend/optimizer/plan/initsplan.c +++ b/src/backend/optimizer/plan/initsplan.c @@ -3048,36 +3048,16 @@ add_base_clause_to_rel(PlannerInfo *root, Index relid, * expr_is_nonnullable * Check to see if the Expr cannot be NULL * - * If the Expr is a simple Var that is defined NOT NULL and meanwhile is not - * nulled by any outer joins, then we can know that it cannot be NULL. + * Currently we only support simple Vars. */ static bool expr_is_nonnullable(PlannerInfo *root, Expr *expr) { - RelOptInfo *rel; - Var *var; - /* For now only check simple Vars */ if (!IsA(expr, Var)) return false; - var = (Var *) expr; - - /* could the Var be nulled by any outer joins? */ - if (!bms_is_empty(var->varnullingrels)) - return false; - - /* system columns cannot be NULL */ - if (var->varattno < 0) - return true; - - /* is the column defined NOT NULL? 
*/ - rel = find_base_rel(root, var->varno); - if (var->varattno > 0 && - bms_is_member(var->varattno, rel->notnullattnums)) - return true; - - return false; + return var_is_nonnullable(root, (Var *) expr, true); } /* diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 549aedcfa99..d59d6e4c6a0 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -342,6 +342,7 @@ standard_planner(Query *parse, const char *query_string, int cursorOptions, glob->transientPlan = false; glob->dependsOnRole = false; glob->partition_directory = NULL; + glob->rel_notnullatts_hash = NULL; /* * Assess whether it's feasible to use parallel mode for this query. We @@ -557,6 +558,7 @@ standard_planner(Query *parse, const char *query_string, int cursorOptions, result->commandType = parse->commandType; result->queryId = parse->queryId; + result->planOrigin = PLAN_STMT_STANDARD; result->hasReturning = (parse->returningList != NIL); result->hasModifyingCTE = parse->hasModifyingCTE; result->canSetTag = parse->canSetTag; @@ -721,6 +723,18 @@ subquery_planner(PlannerGlobal *glob, Query *parse, PlannerInfo *parent_root, transform_MERGE_to_join(parse); /* + * Scan the rangetable for relation RTEs and retrieve the necessary + * catalog information for each relation. Using this information, clear + * the inh flag for any relation that has no children, collect not-null + * attribute numbers for any relation that has column not-null + * constraints, and expand virtual generated columns for any relation that + * contains them. Note that this step does not descend into sublinks and + * subqueries; if we pull up any sublinks or subqueries below, their + * relation RTEs are processed just before pulling them up. + */ + parse = root->parse = preprocess_relation_rtes(root); + + /* * If the FROM clause is empty, replace it with a dummy RTE_RESULT RTE, so * that we don't need so many special cases to deal with that situation. */ @@ -744,14 +758,6 @@ subquery_planner(PlannerGlobal *glob, Query *parse, PlannerInfo *parent_root, preprocess_function_rtes(root); /* - * Scan the rangetable for relations with virtual generated columns, and - * replace all Var nodes in the query that reference these columns with - * the generation expressions. Recursion issues here are handled in the - * same way as for SubLinks. - */ - parse = root->parse = expand_virtual_generated_columns(root); - - /* * Check to see if any subqueries in the jointree can be merged into this * query. */ @@ -787,23 +793,6 @@ subquery_planner(PlannerGlobal *glob, Query *parse, PlannerInfo *parent_root, switch (rte->rtekind) { - case RTE_RELATION: - if (rte->inh) - { - /* - * Check to see if the relation actually has any children; - * if not, clear the inh flag so we can treat it as a - * plain base relation. - * - * Note: this could give a false-positive result, if the - * rel once had children but no longer does. We used to - * be able to clear rte->inh later on when we discovered - * that, but no more; we have to handle such cases as - * full-fledged inheritance. 
- */ - rte->inh = has_subclass(rte->relid); - } - break; case RTE_JOIN: root->hasJoinRTEs = true; if (IS_OUTER_JOIN(rte->jointype)) diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index e7cb3fede66..d71ed958e31 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -1454,6 +1454,7 @@ convert_EXISTS_sublink_to_join(PlannerInfo *root, SubLink *sublink, Query *parse = root->parse; Query *subselect = (Query *) sublink->subselect; Node *whereClause; + PlannerInfo subroot; int rtoffset; int varno; Relids clause_varnos; @@ -1516,6 +1517,35 @@ convert_EXISTS_sublink_to_join(PlannerInfo *root, SubLink *sublink, return NULL; /* + * Scan the rangetable for relation RTEs and retrieve the necessary + * catalog information for each relation. Using this information, clear + * the inh flag for any relation that has no children, collect not-null + * attribute numbers for any relation that has column not-null + * constraints, and expand virtual generated columns for any relation that + * contains them. + * + * Note: we construct up an entirely dummy PlannerInfo for use here. This + * is fine because only the "glob" and "parse" links will be used in this + * case. + * + * Note: we temporarily assign back the WHERE clause so that any virtual + * generated column references within it can be expanded. It should be + * separated out again afterward. + */ + MemSet(&subroot, 0, sizeof(subroot)); + subroot.type = T_PlannerInfo; + subroot.glob = root->glob; + subroot.parse = subselect; + subselect->jointree->quals = whereClause; + subselect = preprocess_relation_rtes(&subroot); + + /* + * Now separate out the WHERE clause again. + */ + whereClause = subselect->jointree->quals; + subselect->jointree->quals = NULL; + + /* * The subquery must have a nonempty jointree, but we can make it so. */ replace_empty_jointree(subselect); @@ -1732,6 +1762,7 @@ convert_EXISTS_to_ANY(PlannerInfo *root, Query *subselect, Node **testexpr, List **paramIds) { Node *whereClause; + PlannerInfo subroot; List *leftargs, *rightargs, *opids, @@ -1791,12 +1822,15 @@ convert_EXISTS_to_ANY(PlannerInfo *root, Query *subselect, * parent aliases were flattened already, and we're not going to pull any * child Vars (of any description) into the parent. * - * Note: passing the parent's root to eval_const_expressions is - * technically wrong, but we can get away with it since only the - * boundParams (if any) are used, and those would be the same in a - * subroot. - */ - whereClause = eval_const_expressions(root, whereClause); + * Note: we construct up an entirely dummy PlannerInfo to pass to + * eval_const_expressions. This is fine because only the "glob" and + * "parse" links are used by eval_const_expressions. + */ + MemSet(&subroot, 0, sizeof(subroot)); + subroot.type = T_PlannerInfo; + subroot.glob = root->glob; + subroot.parse = subselect; + whereClause = eval_const_expressions(&subroot, whereClause); whereClause = (Node *) canonicalize_qual((Expr *) whereClause, false); whereClause = (Node *) make_ands_implicit((Expr *) whereClause); diff --git a/src/backend/optimizer/prep/prepjointree.c b/src/backend/optimizer/prep/prepjointree.c index 87dc6f56b57..35e8d3c183b 100644 --- a/src/backend/optimizer/prep/prepjointree.c +++ b/src/backend/optimizer/prep/prepjointree.c @@ -4,10 +4,10 @@ * Planner preprocessing for subqueries and join tree manipulation. 
* * NOTE: the intended sequence for invoking these operations is + * preprocess_relation_rtes * replace_empty_jointree * pull_up_sublinks * preprocess_function_rtes - * expand_virtual_generated_columns * pull_up_subqueries * flatten_simple_union_all * do expression preprocessing (including flattening JOIN alias vars) @@ -36,6 +36,7 @@ #include "optimizer/clauses.h" #include "optimizer/optimizer.h" #include "optimizer/placeholder.h" +#include "optimizer/plancat.h" #include "optimizer/prep.h" #include "optimizer/subselect.h" #include "optimizer/tlist.h" @@ -102,6 +103,9 @@ typedef struct reduce_outer_joins_partial_state Relids unreduced_side; /* relids in its still-nullable side */ } reduce_outer_joins_partial_state; +static Query *expand_virtual_generated_columns(PlannerInfo *root, Query *parse, + RangeTblEntry *rte, int rt_index, + Relation relation); static Node *pull_up_sublinks_jointree_recurse(PlannerInfo *root, Node *jtnode, Relids *relids); static Node *pull_up_sublinks_qual_recurse(PlannerInfo *root, Node *node, @@ -393,6 +397,181 @@ transform_MERGE_to_join(Query *parse) } /* + * preprocess_relation_rtes + * Do the preprocessing work for any relation RTEs in the FROM clause. + * + * This scans the rangetable for relation RTEs and retrieves the necessary + * catalog information for each relation. Using this information, it clears + * the inh flag for any relation that has no children, collects not-null + * attribute numbers for any relation that has column not-null constraints, and + * expands virtual generated columns for any relation that contains them. + * + * Note that expanding virtual generated columns may cause the query tree to + * have new copies of rangetable entries. Therefore, we have to use list_nth + * instead of foreach when iterating over the query's rangetable. + * + * Returns a modified copy of the query tree, if any relations with virtual + * generated columns are present. + */ +Query * +preprocess_relation_rtes(PlannerInfo *root) +{ + Query *parse = root->parse; + int rtable_size; + int rt_index; + + rtable_size = list_length(parse->rtable); + + for (rt_index = 0; rt_index < rtable_size; rt_index++) + { + RangeTblEntry *rte = rt_fetch(rt_index + 1, parse->rtable); + Relation relation; + + /* We only care about relation RTEs. */ + if (rte->rtekind != RTE_RELATION) + continue; + + /* + * We need not lock the relation since it was already locked by the + * rewriter. + */ + relation = table_open(rte->relid, NoLock); + + /* + * Check to see if the relation actually has any children; if not, + * clear the inh flag so we can treat it as a plain base relation. + * + * Note: this could give a false-positive result, if the rel once had + * children but no longer does. We used to be able to clear rte->inh + * later on when we discovered that, but no more; we have to handle + * such cases as full-fledged inheritance. + */ + if (rte->inh) + rte->inh = relation->rd_rel->relhassubclass; + + /* + * Check to see if the relation has any column not-null constraints; + * if so, retrieve the constraint information and store it in a + * relation OID based hash table. + */ + get_relation_notnullatts(root, relation); + + /* + * Check to see if the relation has any virtual generated columns; if + * so, replace all Var nodes in the query that reference these columns + * with the generation expressions. 
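 * (Illustrative example with an invented table: for a column declared
 * "b int GENERATED ALWAYS AS (a * 2) VIRTUAL", each Var referencing b in
 * the query is replaced by the expression "a * 2", with its varno adjusted
 * to this RTE's index.)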
+ */ + parse = expand_virtual_generated_columns(root, parse, + rte, rt_index + 1, + relation); + + table_close(relation, NoLock); + } + + return parse; +} + +/* + * expand_virtual_generated_columns + * Expand virtual generated columns for the given relation. + * + * This checks whether the given relation has any virtual generated columns, + * and if so, replaces all Var nodes in the query that reference those columns + * with their generation expressions. + * + * Returns a modified copy of the query tree if the relation contains virtual + * generated columns. + */ +static Query * +expand_virtual_generated_columns(PlannerInfo *root, Query *parse, + RangeTblEntry *rte, int rt_index, + Relation relation) +{ + TupleDesc tupdesc; + + /* Only normal relations can have virtual generated columns */ + Assert(rte->rtekind == RTE_RELATION); + + tupdesc = RelationGetDescr(relation); + if (tupdesc->constr && tupdesc->constr->has_generated_virtual) + { + List *tlist = NIL; + pullup_replace_vars_context rvcontext; + + for (int i = 0; i < tupdesc->natts; i++) + { + Form_pg_attribute attr = TupleDescAttr(tupdesc, i); + TargetEntry *tle; + + if (attr->attgenerated == ATTRIBUTE_GENERATED_VIRTUAL) + { + Node *defexpr; + + defexpr = build_generation_expression(relation, i + 1); + ChangeVarNodes(defexpr, 1, rt_index, 0); + + tle = makeTargetEntry((Expr *) defexpr, i + 1, 0, false); + tlist = lappend(tlist, tle); + } + else + { + Var *var; + + var = makeVar(rt_index, + i + 1, + attr->atttypid, + attr->atttypmod, + attr->attcollation, + 0); + + tle = makeTargetEntry((Expr *) var, i + 1, 0, false); + tlist = lappend(tlist, tle); + } + } + + Assert(list_length(tlist) > 0); + Assert(!rte->lateral); + + /* + * The relation's targetlist items are now in the appropriate form to + * insert into the query, except that we may need to wrap them in + * PlaceHolderVars. Set up required context data for + * pullup_replace_vars. + */ + rvcontext.root = root; + rvcontext.targetlist = tlist; + rvcontext.target_rte = rte; + rvcontext.result_relation = parse->resultRelation; + /* won't need these values */ + rvcontext.relids = NULL; + rvcontext.nullinfo = NULL; + /* pass NULL for outer_hasSubLinks */ + rvcontext.outer_hasSubLinks = NULL; + rvcontext.varno = rt_index; + /* this flag will be set below, if needed */ + rvcontext.wrap_option = REPLACE_WRAP_NONE; + /* initialize cache array with indexes 0 .. length(tlist) */ + rvcontext.rv_cache = palloc0((list_length(tlist) + 1) * + sizeof(Node *)); + + /* + * If the query uses grouping sets, we need a PlaceHolderVar for each + * expression of the relation's targetlist items. (See comments in + * pull_up_simple_subquery().) + */ + if (parse->groupingSets) + rvcontext.wrap_option = REPLACE_WRAP_ALL; + + /* + * Apply pullup variable replacement throughout the query tree. + */ + parse = (Query *) pullup_replace_vars((Node *) parse, &rvcontext); + } + + return parse; +} + +/* * replace_empty_jointree * If the Query's jointree is empty, replace it with a dummy RTE_RESULT * relation. @@ -950,128 +1129,6 @@ preprocess_function_rtes(PlannerInfo *root) } /* - * expand_virtual_generated_columns - * Expand all virtual generated column references in a query. - * - * This scans the rangetable for relations with virtual generated columns, and - * replaces all Var nodes in the query that reference these columns with the - * generation expressions. Note that we do not descend into subqueries; that - * is taken care of when the subqueries are planned. 
- * - * This has to be done after we have pulled up any SubLinks within the query's - * quals; otherwise any virtual generated column references within the SubLinks - * that should be transformed into joins wouldn't get expanded. - * - * Returns a modified copy of the query tree, if any relations with virtual - * generated columns are present. - */ -Query * -expand_virtual_generated_columns(PlannerInfo *root) -{ - Query *parse = root->parse; - int rt_index; - ListCell *lc; - - rt_index = 0; - foreach(lc, parse->rtable) - { - RangeTblEntry *rte = (RangeTblEntry *) lfirst(lc); - Relation rel; - TupleDesc tupdesc; - - ++rt_index; - - /* - * Only normal relations can have virtual generated columns. - */ - if (rte->rtekind != RTE_RELATION) - continue; - - rel = table_open(rte->relid, NoLock); - - tupdesc = RelationGetDescr(rel); - if (tupdesc->constr && tupdesc->constr->has_generated_virtual) - { - List *tlist = NIL; - pullup_replace_vars_context rvcontext; - - for (int i = 0; i < tupdesc->natts; i++) - { - Form_pg_attribute attr = TupleDescAttr(tupdesc, i); - TargetEntry *tle; - - if (attr->attgenerated == ATTRIBUTE_GENERATED_VIRTUAL) - { - Node *defexpr; - - defexpr = build_generation_expression(rel, i + 1); - ChangeVarNodes(defexpr, 1, rt_index, 0); - - tle = makeTargetEntry((Expr *) defexpr, i + 1, 0, false); - tlist = lappend(tlist, tle); - } - else - { - Var *var; - - var = makeVar(rt_index, - i + 1, - attr->atttypid, - attr->atttypmod, - attr->attcollation, - 0); - - tle = makeTargetEntry((Expr *) var, i + 1, 0, false); - tlist = lappend(tlist, tle); - } - } - - Assert(list_length(tlist) > 0); - Assert(!rte->lateral); - - /* - * The relation's targetlist items are now in the appropriate form - * to insert into the query, except that we may need to wrap them - * in PlaceHolderVars. Set up required context data for - * pullup_replace_vars. - */ - rvcontext.root = root; - rvcontext.targetlist = tlist; - rvcontext.target_rte = rte; - rvcontext.result_relation = parse->resultRelation; - /* won't need these values */ - rvcontext.relids = NULL; - rvcontext.nullinfo = NULL; - /* pass NULL for outer_hasSubLinks */ - rvcontext.outer_hasSubLinks = NULL; - rvcontext.varno = rt_index; - /* this flag will be set below, if needed */ - rvcontext.wrap_option = REPLACE_WRAP_NONE; - /* initialize cache array with indexes 0 .. length(tlist) */ - rvcontext.rv_cache = palloc0((list_length(tlist) + 1) * - sizeof(Node *)); - - /* - * If the query uses grouping sets, we need a PlaceHolderVar for - * each expression of the relation's targetlist items. (See - * comments in pull_up_simple_subquery().) - */ - if (parse->groupingSets) - rvcontext.wrap_option = REPLACE_WRAP_ALL; - - /* - * Apply pullup variable replacement throughout the query tree. - */ - parse = (Query *) pullup_replace_vars((Node *) parse, &rvcontext); - } - - table_close(rel, NoLock); - } - - return parse; -} - -/* * pull_up_subqueries * Look for subqueries in the rangetable that can be pulled up into * the parent query. If the subquery has no special features like @@ -1334,6 +1391,16 @@ pull_up_simple_subquery(PlannerInfo *root, Node *jtnode, RangeTblEntry *rte, Assert(subquery->cteList == NIL); /* + * Scan the rangetable for relation RTEs and retrieve the necessary + * catalog information for each relation. 
Using this information, clear + * the inh flag for any relation that has no children, collect not-null + * attribute numbers for any relation that has column not-null + * constraints, and expand virtual generated columns for any relation that + * contains them. + */ + subquery = subroot->parse = preprocess_relation_rtes(subroot); + + /* * If the FROM clause is empty, replace it with a dummy RTE_RESULT RTE, so * that we don't need so many special cases to deal with that situation. */ @@ -1353,13 +1420,6 @@ pull_up_simple_subquery(PlannerInfo *root, Node *jtnode, RangeTblEntry *rte, preprocess_function_rtes(subroot); /* - * Scan the rangetable for relations with virtual generated columns, and - * replace all Var nodes in the query that reference these columns with - * the generation expressions. - */ - subquery = subroot->parse = expand_virtual_generated_columns(subroot); - - /* * Recursively pull up the subquery's subqueries, so that * pull_up_subqueries' processing is complete for its jointree and * rangetable. diff --git a/src/backend/optimizer/util/clauses.c b/src/backend/optimizer/util/clauses.c index f45131c34c5..6f0b338d2cd 100644 --- a/src/backend/optimizer/util/clauses.c +++ b/src/backend/optimizer/util/clauses.c @@ -20,6 +20,7 @@ #include "postgres.h" #include "access/htup_details.h" +#include "catalog/pg_class.h" #include "catalog/pg_language.h" #include "catalog/pg_operator.h" #include "catalog/pg_proc.h" @@ -36,6 +37,7 @@ #include "optimizer/clauses.h" #include "optimizer/cost.h" #include "optimizer/optimizer.h" +#include "optimizer/pathnode.h" #include "optimizer/plancat.h" #include "optimizer/planmain.h" #include "parser/analyze.h" @@ -43,6 +45,7 @@ #include "parser/parse_collate.h" #include "parser/parse_func.h" #include "parser/parse_oper.h" +#include "parser/parsetree.h" #include "rewrite/rewriteHandler.h" #include "rewrite/rewriteManip.h" #include "tcop/tcopprot.h" @@ -2242,7 +2245,8 @@ rowtype_field_matches(Oid rowtypeid, int fieldnum, * only operators and functions that are reasonable to try to execute. * * NOTE: "root" can be passed as NULL if the caller never wants to do any - * Param substitutions nor receive info about inlined functions. + * Param substitutions nor receive info about inlined functions nor reduce + * NullTest for Vars to constant true or constant false. * * NOTE: the planner assumes that this will always flatten nested AND and * OR clauses into N-argument form. See comments in prepqual.c. @@ -3544,6 +3548,31 @@ eval_const_expressions_mutator(Node *node, return makeBoolConst(result, false); } + if (!ntest->argisrow && arg && IsA(arg, Var) && context->root) + { + Var *varg = (Var *) arg; + bool result; + + if (var_is_nonnullable(context->root, varg, false)) + { + switch (ntest->nulltesttype) + { + case IS_NULL: + result = false; + break; + case IS_NOT_NULL: + result = true; + break; + default: + elog(ERROR, "unrecognized nulltesttype: %d", + (int) ntest->nulltesttype); + result = false; /* keep compiler quiet */ + break; + } + + return makeBoolConst(result, false); + } + } newntest = makeNode(NullTest); newntest->arg = (Expr *) arg; @@ -4163,6 +4192,67 @@ simplify_function(Oid funcid, Oid result_type, int32 result_typmod, } /* + * var_is_nonnullable: check to see if the Var cannot be NULL + * + * If the Var is defined NOT NULL and meanwhile is not nulled by any outer + * joins or grouping sets, then we can know that it cannot be NULL. + * + * use_rel_info indicates whether the corresponding RelOptInfo is available for + * use. 
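 * Illustrative example (invented schema): given "CREATE TABLE t (a int NOT
 * NULL)" used as a plain base relation, a Var for t.a that is not nulled by
 * any outer join or grouping set lets eval_const_expressions() reduce
 * "a IS NOT NULL" to constant true and "a IS NULL" to constant false.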
+ */ +bool +var_is_nonnullable(PlannerInfo *root, Var *var, bool use_rel_info) +{ + Relids notnullattnums = NULL; + + Assert(IsA(var, Var)); + + /* skip upper-level Vars */ + if (var->varlevelsup != 0) + return false; + + /* could the Var be nulled by any outer joins or grouping sets? */ + if (!bms_is_empty(var->varnullingrels)) + return false; + + /* system columns cannot be NULL */ + if (var->varattno < 0) + return true; + + /* + * Check if the Var is defined as NOT NULL. We retrieve the column NOT + * NULL constraint information from the corresponding RelOptInfo if it is + * available; otherwise, we search the hash table for this information. + */ + if (use_rel_info) + { + RelOptInfo *rel = find_base_rel(root, var->varno); + + notnullattnums = rel->notnullattnums; + } + else + { + RangeTblEntry *rte = planner_rt_fetch(var->varno, root); + + /* + * We must skip inheritance parent tables, as some child tables may + * have a NOT NULL constraint for a column while others may not. This + * cannot happen with partitioned tables, though. + */ + if (rte->inh && rte->relkind != RELKIND_PARTITIONED_TABLE) + return false; + + notnullattnums = find_relation_notnullatts(root, rte->relid); + } + + if (var->varattno > 0 && + bms_is_member(var->varattno, notnullattnums)) + return true; + + return false; +} + +/* * expand_function_arguments: convert named-notation args to positional args * and/or insert default args, as needed * diff --git a/src/backend/optimizer/util/inherit.c b/src/backend/optimizer/util/inherit.c index 17e51cd75d7..30d158069e3 100644 --- a/src/backend/optimizer/util/inherit.c +++ b/src/backend/optimizer/util/inherit.c @@ -466,8 +466,7 @@ expand_single_inheritance_child(PlannerInfo *root, RangeTblEntry *parentrte, Index *childRTindex_p) { Query *parse = root->parse; - Oid parentOID PG_USED_FOR_ASSERTS_ONLY = - RelationGetRelid(parentrel); + Oid parentOID = RelationGetRelid(parentrel); Oid childOID = RelationGetRelid(childrel); RangeTblEntry *childrte; Index childRTindex; @@ -514,6 +513,13 @@ expand_single_inheritance_child(PlannerInfo *root, RangeTblEntry *parentrte, *childRTindex_p = childRTindex; /* + * Retrieve column not-null constraint information for the child relation + * if its relation OID is different from the parent's. + */ + if (childOID != parentOID) + get_relation_notnullatts(root, childrel); + + /* * Build an AppendRelInfo struct for each parent/child pair. */ appinfo = make_append_rel_info(parentrel, childrel, diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 9cc602788ea..a4c5867cdcb 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -1689,7 +1689,7 @@ create_material_path(RelOptInfo *rel, Path *subpath) MemoizePath * create_memoize_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, List *param_exprs, List *hash_operators, - bool singlerow, bool binary_mode, double calls) + bool singlerow, bool binary_mode, Cardinality est_calls) { MemoizePath *pathnode = makeNode(MemoizePath); @@ -1710,7 +1710,6 @@ create_memoize_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, pathnode->param_exprs = param_exprs; pathnode->singlerow = singlerow; pathnode->binary_mode = binary_mode; - pathnode->calls = clamp_row_est(calls); /* * For now we set est_entries to 0. 
cost_memoize_rescan() does all the @@ -1720,6 +1719,12 @@ create_memoize_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, */ pathnode->est_entries = 0; + pathnode->est_calls = clamp_row_est(est_calls); + + /* These will also be set later in cost_memoize_rescan() */ + pathnode->est_unique_keys = 0.0; + pathnode->est_hit_ratio = 0.0; + /* we should not generate this path type when enable_memoize=false */ Assert(enable_memoize); pathnode->path.disabled_nodes = subpath->disabled_nodes; @@ -4259,7 +4264,7 @@ reparameterize_path(PlannerInfo *root, Path *path, mpath->hash_operators, mpath->singlerow, mpath->binary_mode, - mpath->calls); + mpath->est_calls); } default: break; diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index 59233b64730..c6a58afc5e5 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -59,6 +59,12 @@ int constraint_exclusion = CONSTRAINT_EXCLUSION_PARTITION; /* Hook for plugins to get control in get_relation_info() */ get_relation_info_hook_type get_relation_info_hook = NULL; +typedef struct NotnullHashEntry +{ + Oid relid; /* OID of the relation */ + Relids notnullattnums; /* attnums of NOT NULL columns */ +} NotnullHashEntry; + static void get_relation_foreign_keys(PlannerInfo *root, RelOptInfo *rel, Relation relation, bool inhparent); @@ -172,27 +178,7 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, * RangeTblEntry does get populated. */ if (!inhparent || relation->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) - { - for (int i = 0; i < relation->rd_att->natts; i++) - { - CompactAttribute *attr = TupleDescCompactAttr(relation->rd_att, i); - - Assert(attr->attnullability != ATTNULLABLE_UNKNOWN); - - if (attr->attnullability == ATTNULLABLE_VALID) - { - rel->notnullattnums = bms_add_member(rel->notnullattnums, - i + 1); - - /* - * Per RemoveAttributeById(), dropped columns will have their - * attnotnull unset, so we needn't check for dropped columns - * in the above condition. - */ - Assert(!attr->attisdropped); - } - } - } + rel->notnullattnums = find_relation_notnullatts(root, relationObjectId); /* * Estimate relation size --- unless it's an inheritance parent, in which @@ -684,6 +670,105 @@ get_relation_foreign_keys(PlannerInfo *root, RelOptInfo *rel, } /* + * get_relation_notnullatts - + * Retrieves column not-null constraint information for a given relation. + * + * We do this while we have the relcache entry open, and store the column + * not-null constraint information in a hash table based on the relation OID. + */ +void +get_relation_notnullatts(PlannerInfo *root, Relation relation) +{ + Oid relid = RelationGetRelid(relation); + NotnullHashEntry *hentry; + bool found; + Relids notnullattnums = NULL; + + /* bail out if the relation has no not-null constraints */ + if (relation->rd_att->constr == NULL || + !relation->rd_att->constr->has_not_null) + return; + + /* create the hash table if it hasn't been created yet */ + if (root->glob->rel_notnullatts_hash == NULL) + { + HTAB *hashtab; + HASHCTL hash_ctl; + + hash_ctl.keysize = sizeof(Oid); + hash_ctl.entrysize = sizeof(NotnullHashEntry); + hash_ctl.hcxt = CurrentMemoryContext; + + hashtab = hash_create("Relation NOT NULL attnums", + 64L, /* arbitrary initial size */ + &hash_ctl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + + root->glob->rel_notnullatts_hash = hashtab; + } + + /* + * Create a hash entry for this relation OID, if we don't have one + * already. 
+ */ + hentry = (NotnullHashEntry *) hash_search(root->glob->rel_notnullatts_hash, + &relid, + HASH_ENTER, + &found); + + /* bail out if a hash entry already exists for this relation OID */ + if (found) + return; + + /* collect the column not-null constraint information for this relation */ + for (int i = 0; i < relation->rd_att->natts; i++) + { + CompactAttribute *attr = TupleDescCompactAttr(relation->rd_att, i); + + Assert(attr->attnullability != ATTNULLABLE_UNKNOWN); + + if (attr->attnullability == ATTNULLABLE_VALID) + { + notnullattnums = bms_add_member(notnullattnums, i + 1); + + /* + * Per RemoveAttributeById(), dropped columns will have their + * attnotnull unset, so we needn't check for dropped columns in + * the above condition. + */ + Assert(!attr->attisdropped); + } + } + + /* ... and initialize the new hash entry */ + hentry->notnullattnums = notnullattnums; +} + +/* + * find_relation_notnullatts - + * Searches the hash table and returns the column not-null constraint + * information for a given relation. + */ +Relids +find_relation_notnullatts(PlannerInfo *root, Oid relid) +{ + NotnullHashEntry *hentry; + bool found; + + if (root->glob->rel_notnullatts_hash == NULL) + return NULL; + + hentry = (NotnullHashEntry *) hash_search(root->glob->rel_notnullatts_hash, + &relid, + HASH_FIND, + &found); + if (!found) + return NULL; + + return hentry->notnullattnums; +} + +/* * infer_arbiter_indexes - * Determine the unique indexes used to arbitrate speculative insertion. * diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index 73345bb3c70..db43034b9db 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -318,6 +318,11 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); %type <list> opt_qualified_name %type <boolean> opt_concurrently %type <dbehavior> opt_drop_behavior +%type <list> opt_utility_option_list +%type <list> utility_option_list +%type <defelt> utility_option_elem +%type <str> utility_option_name +%type <node> utility_option_arg %type <node> alter_column_default opclass_item opclass_drop alter_using %type <ival> add_drop opt_asc_desc opt_nulls_order @@ -338,10 +343,6 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); create_extension_opt_item alter_extension_opt_item %type <ival> opt_lock lock_type cast_context -%type <str> utility_option_name -%type <defelt> utility_option_elem -%type <list> utility_option_list -%type <node> utility_option_arg %type <defelt> drop_option %type <boolean> opt_or_replace opt_no opt_grant_grant_option @@ -556,7 +557,6 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); %type <list> generic_option_list alter_generic_option_list %type <ival> reindex_target_relation reindex_target_all -%type <list> opt_reindex_option_list %type <node> copy_generic_opt_arg copy_generic_opt_arg_list_item %type <defelt> copy_generic_opt_elem @@ -1141,6 +1141,41 @@ opt_drop_behavior: | /* EMPTY */ { $$ = DROP_RESTRICT; /* default */ } ; +opt_utility_option_list: + '(' utility_option_list ')' { $$ = $2; } + | /* EMPTY */ { $$ = NULL; } + ; + +utility_option_list: + utility_option_elem + { + $$ = list_make1($1); + } + | utility_option_list ',' utility_option_elem + { + $$ = lappend($1, $3); + } + ; + +utility_option_elem: + utility_option_name utility_option_arg + { + $$ = makeDefElem($1, $2, @1); + } + ; + +utility_option_name: + NonReservedWord { $$ = $1; } + | analyze_keyword { $$ = "analyze"; } + | FORMAT_LA { $$ = "format"; } + ; + 
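For illustration, the shared productions above cover the parenthesized option-list syntax of several utility commands; the grammar accepts any utility_option_name here, and each command validates its own options later (relation name invented):

    REINDEX (VERBOSE) TABLE example_tbl;
    CLUSTER (VERBOSE);
    ANALYZE (VERBOSE) example_tbl;
    CHECKPOINT (some_option);    -- option names are checked by the command, not the grammar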
+utility_option_arg: + opt_boolean_or_string { $$ = (Node *) makeString($1); } + | NumericOnly { $$ = (Node *) $1; } + | /* EMPTY */ { $$ = NULL; } + ; + /***************************************************************************** * * CALL statement @@ -2028,18 +2063,12 @@ constraints_set_mode: * Checkpoint statement */ CheckPointStmt: - CHECKPOINT + CHECKPOINT opt_utility_option_list { CheckPointStmt *n = makeNode(CheckPointStmt); $$ = (Node *) n; - } - | CHECKPOINT '(' utility_option_list ')' - { - CheckPointStmt *n = makeNode(CheckPointStmt); - - $$ = (Node *) n; - n->options = $3; + n->options = $2; } ; @@ -9354,7 +9383,7 @@ DropTransformStmt: DROP TRANSFORM opt_if_exists FOR Typename LANGUAGE name opt_d *****************************************************************************/ ReindexStmt: - REINDEX opt_reindex_option_list reindex_target_relation opt_concurrently qualified_name + REINDEX opt_utility_option_list reindex_target_relation opt_concurrently qualified_name { ReindexStmt *n = makeNode(ReindexStmt); @@ -9367,7 +9396,7 @@ ReindexStmt: makeDefElem("concurrently", NULL, @4)); $$ = (Node *) n; } - | REINDEX opt_reindex_option_list SCHEMA opt_concurrently name + | REINDEX opt_utility_option_list SCHEMA opt_concurrently name { ReindexStmt *n = makeNode(ReindexStmt); @@ -9380,7 +9409,7 @@ ReindexStmt: makeDefElem("concurrently", NULL, @4)); $$ = (Node *) n; } - | REINDEX opt_reindex_option_list reindex_target_all opt_concurrently opt_single_name + | REINDEX opt_utility_option_list reindex_target_all opt_concurrently opt_single_name { ReindexStmt *n = makeNode(ReindexStmt); @@ -9402,10 +9431,6 @@ reindex_target_all: SYSTEM_P { $$ = REINDEX_OBJECT_SYSTEM; } | DATABASE { $$ = REINDEX_OBJECT_DATABASE; } ; -opt_reindex_option_list: - '(' utility_option_list ')' { $$ = $2; } - | /* EMPTY */ { $$ = NULL; } - ; /***************************************************************************** * @@ -11903,13 +11928,13 @@ ClusterStmt: n->params = $3; $$ = (Node *) n; } - | CLUSTER '(' utility_option_list ')' + | CLUSTER opt_utility_option_list { ClusterStmt *n = makeNode(ClusterStmt); n->relation = NULL; n->indexname = NULL; - n->params = $3; + n->params = $2; $$ = (Node *) n; } /* unparenthesized VERBOSE kept for pre-14 compatibility */ @@ -11919,21 +11944,18 @@ ClusterStmt: n->relation = $3; n->indexname = $4; - n->params = NIL; if ($2) - n->params = lappend(n->params, makeDefElem("verbose", NULL, @2)); + n->params = list_make1(makeDefElem("verbose", NULL, @2)); $$ = (Node *) n; } /* unparenthesized VERBOSE kept for pre-17 compatibility */ - | CLUSTER opt_verbose + | CLUSTER VERBOSE { ClusterStmt *n = makeNode(ClusterStmt); n->relation = NULL; n->indexname = NULL; - n->params = NIL; - if ($2) - n->params = lappend(n->params, makeDefElem("verbose", NULL, @2)); + n->params = list_make1(makeDefElem("verbose", NULL, @2)); $$ = (Node *) n; } /* kept for pre-8.3 compatibility */ @@ -11943,9 +11965,8 @@ ClusterStmt: n->relation = $5; n->indexname = $3; - n->params = NIL; if ($2) - n->params = lappend(n->params, makeDefElem("verbose", NULL, @2)); + n->params = list_make1(makeDefElem("verbose", NULL, @2)); $$ = (Node *) n; } ; @@ -11996,64 +12017,31 @@ VacuumStmt: VACUUM opt_full opt_freeze opt_verbose opt_analyze opt_vacuum_relati } ; -AnalyzeStmt: analyze_keyword opt_verbose opt_vacuum_relation_list +AnalyzeStmt: analyze_keyword opt_utility_option_list opt_vacuum_relation_list { VacuumStmt *n = makeNode(VacuumStmt); - n->options = NIL; - if ($2) - n->options = lappend(n->options, - 
makeDefElem("verbose", NULL, @2)); + n->options = $2; n->rels = $3; n->is_vacuumcmd = false; $$ = (Node *) n; } - | analyze_keyword '(' utility_option_list ')' opt_vacuum_relation_list + | analyze_keyword VERBOSE opt_vacuum_relation_list { VacuumStmt *n = makeNode(VacuumStmt); - n->options = $3; - n->rels = $5; + n->options = list_make1(makeDefElem("verbose", NULL, @2)); + n->rels = $3; n->is_vacuumcmd = false; $$ = (Node *) n; } ; -utility_option_list: - utility_option_elem - { - $$ = list_make1($1); - } - | utility_option_list ',' utility_option_elem - { - $$ = lappend($1, $3); - } - ; - analyze_keyword: ANALYZE | ANALYSE /* British */ ; -utility_option_elem: - utility_option_name utility_option_arg - { - $$ = makeDefElem($1, $2, @1); - } - ; - -utility_option_name: - NonReservedWord { $$ = $1; } - | analyze_keyword { $$ = "analyze"; } - | FORMAT_LA { $$ = "format"; } - ; - -utility_option_arg: - opt_boolean_or_string { $$ = (Node *) makeString($1); } - | NumericOnly { $$ = (Node *) $1; } - | /* EMPTY */ { $$ = NULL; } - ; - opt_analyze: analyze_keyword { $$ = true; } | /*EMPTY*/ { $$ = false; } diff --git a/src/backend/partitioning/partbounds.c b/src/backend/partitioning/partbounds.c index 4bdc2941efb..822cf4ec451 100644 --- a/src/backend/partitioning/partbounds.c +++ b/src/backend/partitioning/partbounds.c @@ -1007,9 +1007,6 @@ partition_bounds_copy(PartitionBoundInfo src, int ndatums; int nindexes; int partnatts; - bool hash_part; - int natts; - Datum *boundDatums; dest = (PartitionBoundInfo) palloc(sizeof(PartitionBoundInfoData)); @@ -1023,7 +1020,7 @@ partition_bounds_copy(PartitionBoundInfo src, dest->datums = (Datum **) palloc(sizeof(Datum *) * ndatums); - if (src->kind != NULL) + if (src->kind != NULL && ndatums > 0) { PartitionRangeDatumKind *boundKinds; @@ -1058,36 +1055,40 @@ partition_bounds_copy(PartitionBoundInfo src, * For hash partitioning, datums array will have two elements - modulus * and remainder. */ - hash_part = (key->strategy == PARTITION_STRATEGY_HASH); - natts = hash_part ? 2 : partnatts; - boundDatums = palloc(ndatums * natts * sizeof(Datum)); - - for (i = 0; i < ndatums; i++) + if (ndatums > 0) { - int j; - - dest->datums[i] = &boundDatums[i * natts]; + bool hash_part = (key->strategy == PARTITION_STRATEGY_HASH); + int natts = hash_part ? 
2 : partnatts; + Datum *boundDatums = palloc(ndatums * natts * sizeof(Datum)); - for (j = 0; j < natts; j++) + for (i = 0; i < ndatums; i++) { - bool byval; - int typlen; + int j; - if (hash_part) - { - typlen = sizeof(int32); /* Always int4 */ - byval = true; /* int4 is pass-by-value */ - } - else + dest->datums[i] = &boundDatums[i * natts]; + + for (j = 0; j < natts; j++) { - byval = key->parttypbyval[j]; - typlen = key->parttyplen[j]; - } + if (dest->kind == NULL || + dest->kind[i][j] == PARTITION_RANGE_DATUM_VALUE) + { + bool byval; + int typlen; - if (dest->kind == NULL || - dest->kind[i][j] == PARTITION_RANGE_DATUM_VALUE) - dest->datums[i][j] = datumCopy(src->datums[i][j], - byval, typlen); + if (hash_part) + { + typlen = sizeof(int32); /* Always int4 */ + byval = true; /* int4 is pass-by-value */ + } + else + { + byval = key->parttypbyval[j]; + typlen = key->parttyplen[j]; + } + dest->datums[i][j] = datumCopy(src->datums[i][j], + byval, typlen); + } + } } } diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c index 9474095f271..ff96b36d710 100644 --- a/src/backend/postmaster/autovacuum.c +++ b/src/backend/postmaster/autovacuum.c @@ -310,6 +310,16 @@ static AutoVacuumShmemStruct *AutoVacuumShmem; static dlist_head DatabaseList = DLIST_STATIC_INIT(DatabaseList); static MemoryContext DatabaseListCxt = NULL; +/* + * Dummy pointer to persuade Valgrind that we've not leaked the array of + * avl_dbase structs. Make it global to ensure the compiler doesn't + * optimize it away. + */ +#ifdef USE_VALGRIND +extern avl_dbase *avl_dbase_array; +avl_dbase *avl_dbase_array; +#endif + /* Pointer to my own WorkerInfo, valid on each worker */ static WorkerInfo MyWorkerInfo = NULL; @@ -562,10 +572,10 @@ AutoVacLauncherMain(const void *startup_data, size_t startup_data_len) /* * Create the initial database list. The invariant we want this list to - * keep is that it's ordered by decreasing next_time. As soon as an entry - * is updated to a higher time, it will be moved to the front (which is - * correct because the only operation is to add autovacuum_naptime to the - * entry, and time always increases). + * keep is that it's ordered by decreasing next_worker. As soon as an + * entry is updated to a higher time, it will be moved to the front (which + * is correct because the only operation is to add autovacuum_naptime to + * the entry, and time always increases). */ rebuild_database_list(InvalidOid); @@ -1020,6 +1030,10 @@ rebuild_database_list(Oid newdb) /* put all the hash elements into an array */ dbary = palloc(nelems * sizeof(avl_dbase)); + /* keep Valgrind quiet */ +#ifdef USE_VALGRIND + avl_dbase_array = dbary; +#endif i = 0; hash_seq_init(&seq, dbhash); @@ -2565,8 +2579,18 @@ deleted: /* * We leak table_toast_map here (among other things), but since we're - * going away soon, it's not a problem. + * going away soon, it's not a problem normally. But when using Valgrind, + * release some stuff to reduce complaints about leaked storage. */ +#ifdef USE_VALGRIND + hash_destroy(table_toast_map); + FreeTupleDesc(pg_class_desc); + if (bstrategy) + pfree(bstrategy); +#endif + + /* Run the rest in xact context, mainly to avoid Valgrind leak warnings */ + MemoryContextSwitchTo(TopTransactionContext); /* * Update pg_database.datfrozenxid, and truncate pg_xact if possible. 
We diff --git a/src/backend/postmaster/bgworker.c b/src/backend/postmaster/bgworker.c index 116ddf7b835..1ad65c237c3 100644 --- a/src/backend/postmaster/bgworker.c +++ b/src/backend/postmaster/bgworker.c @@ -613,6 +613,7 @@ ResetBackgroundWorkerCrashTimes(void) * resetting. */ rw->rw_crashed_at = 0; + rw->rw_pid = 0; /* * If there was anyone waiting for it, they're history. diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c index 2809e298a44..8490148a47d 100644 --- a/src/backend/postmaster/checkpointer.c +++ b/src/backend/postmaster/checkpointer.c @@ -130,6 +130,13 @@ typedef struct int num_requests; /* current # of requests */ int max_requests; /* allocated array size */ + + int head; /* Index of the first request in the ring + * buffer */ + int tail; /* Index of the last request in the ring + * buffer */ + + /* The ring buffer of pending checkpointer requests */ CheckpointerRequest requests[FLEXIBLE_ARRAY_MEMBER]; } CheckpointerShmemStruct; @@ -138,6 +145,12 @@ static CheckpointerShmemStruct *CheckpointerShmem; /* interval for calling AbsorbSyncRequests in CheckpointWriteDelay */ #define WRITES_PER_ABSORB 1000 +/* Maximum number of checkpointer requests to process in one batch */ +#define CKPT_REQ_BATCH_SIZE 10000 + +/* Max number of requests the checkpointer request queue can hold */ +#define MAX_CHECKPOINT_REQUESTS 10000000 + /* * GUC parameters */ @@ -973,7 +986,8 @@ CheckpointerShmemInit(void) */ MemSet(CheckpointerShmem, 0, size); SpinLockInit(&CheckpointerShmem->ckpt_lck); - CheckpointerShmem->max_requests = NBuffers; + CheckpointerShmem->max_requests = Min(NBuffers, MAX_CHECKPOINT_REQUESTS); + CheckpointerShmem->head = CheckpointerShmem->tail = 0; ConditionVariableInit(&CheckpointerShmem->start_cv); ConditionVariableInit(&CheckpointerShmem->done_cv); } @@ -1201,6 +1215,7 @@ ForwardSyncRequest(const FileTag *ftag, SyncRequestType type) { CheckpointerRequest *request; bool too_full; + int insert_pos; if (!IsUnderPostmaster) return false; /* probably shouldn't even get here */ @@ -1224,10 +1239,14 @@ ForwardSyncRequest(const FileTag *ftag, SyncRequestType type) } /* OK, insert request */ - request = &CheckpointerShmem->requests[CheckpointerShmem->num_requests++]; + insert_pos = CheckpointerShmem->tail; + request = &CheckpointerShmem->requests[insert_pos]; request->ftag = *ftag; request->type = type; + CheckpointerShmem->tail = (CheckpointerShmem->tail + 1) % CheckpointerShmem->max_requests; + CheckpointerShmem->num_requests++; + /* If queue is more than half full, nudge the checkpointer to empty it */ too_full = (CheckpointerShmem->num_requests >= CheckpointerShmem->max_requests / 2); @@ -1269,12 +1288,16 @@ CompactCheckpointerRequestQueue(void) struct CheckpointerSlotMapping { CheckpointerRequest request; - int slot; + int ring_idx; }; - int n, - preserve_count; + int n; int num_skipped = 0; + int head; + int max_requests; + int num_requests; + int read_idx, + write_idx; HASHCTL ctl; HTAB *htab; bool *skip_slot; @@ -1286,8 +1309,13 @@ CompactCheckpointerRequestQueue(void) if (CritSectionCount > 0) return false; + max_requests = CheckpointerShmem->max_requests; + num_requests = CheckpointerShmem->num_requests; + /* Initialize skip_slot array */ - skip_slot = palloc0(sizeof(bool) * CheckpointerShmem->num_requests); + skip_slot = palloc0(sizeof(bool) * max_requests); + + head = CheckpointerShmem->head; /* Initialize temporary hash table */ ctl.keysize = sizeof(CheckpointerRequest); @@ -1311,7 +1339,8 @@ CompactCheckpointerRequestQueue(void) * 
away preceding entries that would end up being canceled anyhow), but * it's not clear that the extra complexity would buy us anything. */ - for (n = 0; n < CheckpointerShmem->num_requests; n++) + read_idx = head; + for (n = 0; n < num_requests; n++) { CheckpointerRequest *request; struct CheckpointerSlotMapping *slotmap; @@ -1324,16 +1353,19 @@ CompactCheckpointerRequestQueue(void) * CheckpointerShmemInit. Note also that RelFileLocator had better * contain no pad bytes. */ - request = &CheckpointerShmem->requests[n]; + request = &CheckpointerShmem->requests[read_idx]; slotmap = hash_search(htab, request, HASH_ENTER, &found); if (found) { /* Duplicate, so mark the previous occurrence as skippable */ - skip_slot[slotmap->slot] = true; + skip_slot[slotmap->ring_idx] = true; num_skipped++; } /* Remember slot containing latest occurrence of this request value */ - slotmap->slot = n; + slotmap->ring_idx = read_idx; + + /* Move to the next request in the ring buffer */ + read_idx = (read_idx + 1) % max_requests; } /* Done with the hash table. */ @@ -1347,17 +1379,34 @@ CompactCheckpointerRequestQueue(void) } /* We found some duplicates; remove them. */ - preserve_count = 0; - for (n = 0; n < CheckpointerShmem->num_requests; n++) + read_idx = write_idx = head; + for (n = 0; n < num_requests; n++) { - if (skip_slot[n]) - continue; - CheckpointerShmem->requests[preserve_count++] = CheckpointerShmem->requests[n]; + /* If this slot is NOT skipped, keep it */ + if (!skip_slot[read_idx]) + { + /* If the read and write positions are different, copy the request */ + if (write_idx != read_idx) + CheckpointerShmem->requests[write_idx] = + CheckpointerShmem->requests[read_idx]; + + /* Advance the write position */ + write_idx = (write_idx + 1) % max_requests; + } + + read_idx = (read_idx + 1) % max_requests; } + + /* + * Update ring buffer state: head remains the same, tail moves, count + * decreases + */ + CheckpointerShmem->tail = write_idx; + CheckpointerShmem->num_requests -= num_skipped; + ereport(DEBUG1, (errmsg_internal("compacted fsync request queue from %d entries to %d entries", - CheckpointerShmem->num_requests, preserve_count))); - CheckpointerShmem->num_requests = preserve_count; + num_requests, CheckpointerShmem->num_requests))); /* Cleanup. */ pfree(skip_slot); @@ -1378,40 +1427,64 @@ AbsorbSyncRequests(void) { CheckpointerRequest *requests = NULL; CheckpointerRequest *request; - int n; + int n, + i; + bool loop; if (!AmCheckpointerProcess()) return; - LWLockAcquire(CheckpointerCommLock, LW_EXCLUSIVE); - - /* - * We try to avoid holding the lock for a long time by copying the request - * array, and processing the requests after releasing the lock. - * - * Once we have cleared the requests from shared memory, we have to PANIC - * if we then fail to absorb them (eg, because our hashtable runs out of - * memory). This is because the system cannot run safely if we are unable - * to fsync what we have been told to fsync. Fortunately, the hashtable - * is so small that the problem is quite unlikely to arise in practice. - */ - n = CheckpointerShmem->num_requests; - if (n > 0) + do { - requests = (CheckpointerRequest *) palloc(n * sizeof(CheckpointerRequest)); - memcpy(requests, CheckpointerShmem->requests, n * sizeof(CheckpointerRequest)); - } + LWLockAcquire(CheckpointerCommLock, LW_EXCLUSIVE); + + /*--- + * We try to avoid holding the lock for a long time by: + * 1. Copying the request array and processing the requests after + * releasing the lock; + * 2. 
Processing not the whole queue, but only batches of + * CKPT_REQ_BATCH_SIZE at once. + * + * Once we have cleared the requests from shared memory, we must + * PANIC if we then fail to absorb them (e.g., because our hashtable + * runs out of memory). This is because the system cannot run safely + * if we are unable to fsync what we have been told to fsync. + * Fortunately, the hashtable is so small that the problem is quite + * unlikely to arise in practice. + * + * Note: The maximum possible size of a ring buffer is + * MAX_CHECKPOINT_REQUESTS entries, which fit into a maximum palloc + * allocation size of 1Gb. Our maximum batch size, + * CKPT_REQ_BATCH_SIZE, is even smaller. + */ + n = Min(CheckpointerShmem->num_requests, CKPT_REQ_BATCH_SIZE); + if (n > 0) + { + if (!requests) + requests = (CheckpointerRequest *) palloc(n * sizeof(CheckpointerRequest)); - START_CRIT_SECTION(); + for (i = 0; i < n; i++) + { + requests[i] = CheckpointerShmem->requests[CheckpointerShmem->head]; + CheckpointerShmem->head = (CheckpointerShmem->head + 1) % CheckpointerShmem->max_requests; + } - CheckpointerShmem->num_requests = 0; + CheckpointerShmem->num_requests -= n; - LWLockRelease(CheckpointerCommLock); + } + + START_CRIT_SECTION(); + + /* Are there any requests in the queue? If so, keep going. */ + loop = CheckpointerShmem->num_requests != 0; + + LWLockRelease(CheckpointerCommLock); - for (request = requests; n > 0; request++, n--) - RememberSyncRequest(&request->ftag, request->type); + for (request = requests; n > 0; request++, n--) + RememberSyncRequest(&request->ftag, request->type); - END_CRIT_SECTION(); + END_CRIT_SECTION(); + } while (loop); if (requests) pfree(requests); diff --git a/src/backend/postmaster/pmchild.c b/src/backend/postmaster/pmchild.c index cde1d23a4ca..584bb58c8ab 100644 --- a/src/backend/postmaster/pmchild.c +++ b/src/backend/postmaster/pmchild.c @@ -60,6 +60,17 @@ NON_EXEC_STATIC int num_pmchild_slots = 0; dlist_head ActiveChildList; /* + * Dummy pointer to persuade Valgrind that we've not leaked the array of + * PMChild structs. Make it global to ensure the compiler doesn't + * optimize it away. + */ +#ifdef USE_VALGRIND +extern PMChild *pmchild_array; +PMChild *pmchild_array; +#endif + + +/* * MaxLivePostmasterChildren * * This reports the number of postmaster child processes that can be active. @@ -125,8 +136,13 @@ InitPostmasterChildSlots(void) for (int i = 0; i < BACKEND_NUM_TYPES; i++) num_pmchild_slots += pmchild_pools[i].size; - /* Initialize them */ + /* Allocate enough slots, and make sure Valgrind doesn't complain */ slots = palloc(num_pmchild_slots * sizeof(PMChild)); +#ifdef USE_VALGRIND + pmchild_array = slots; +#endif + + /* Initialize them */ slotno = 0; for (int btype = 0; btype < BACKEND_NUM_TYPES; btype++) { diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index cca9b946e53..e01d9f0cfe8 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -2630,6 +2630,13 @@ CleanupBackend(PMChild *bp, } bp = NULL; + /* + * In a crash case, exit immediately without resetting background worker + * state. However, if restart_after_crash is enabled, the background + * worker state (e.g., rw_pid) still needs be reset so the worker can + * restart after crash recovery. This reset is handled in + * ResetBackgroundWorkerCrashTimes(), not here. 
+ */ if (crashed) { HandleChildCrash(bp_pid, exitstatus, procname); diff --git a/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c b/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c index f7b5d093681..239641bfbb6 100644 --- a/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c +++ b/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c @@ -232,6 +232,9 @@ libpqrcv_connect(const char *conninfo, bool replication, bool logical, errhint("Target server's authentication method must be changed, or set password_required=false in the subscription parameters."))); } + PQsetNoticeReceiver(conn->streamConn, libpqsrv_notice_receiver, + "received message via replication"); + /* * Set always-secure search path for the cases where the connection is * used to run SQL queries, so malicious users can't get control. @@ -418,31 +421,22 @@ libpqrcv_identify_system(WalReceiverConn *conn, TimeLineID *primary_tli) "IDENTIFY_SYSTEM", WAIT_EVENT_LIBPQWALRECEIVER_RECEIVE); if (PQresultStatus(res) != PGRES_TUPLES_OK) - { - PQclear(res); ereport(ERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("could not receive database system identifier and timeline ID from " "the primary server: %s", pchomp(PQerrorMessage(conn->streamConn))))); - } /* * IDENTIFY_SYSTEM returns 3 columns in 9.3 and earlier, and 4 columns in * 9.4 and onwards. */ if (PQnfields(res) < 3 || PQntuples(res) != 1) - { - int ntuples = PQntuples(res); - int nfields = PQnfields(res); - - PQclear(res); ereport(ERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("invalid response from primary server"), errdetail("Could not identify system: got %d rows and %d fields, expected %d rows and %d or more fields.", - ntuples, nfields, 1, 3))); - } + PQntuples(res), PQnfields(res), 1, 3))); primary_sysid = pstrdup(PQgetvalue(res, 0, 0)); *primary_tli = pg_strtoint32(PQgetvalue(res, 0, 1)); PQclear(res); @@ -604,13 +598,10 @@ libpqrcv_startstreaming(WalReceiverConn *conn, return false; } else if (PQresultStatus(res) != PGRES_COPY_BOTH) - { - PQclear(res); ereport(ERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("could not start WAL streaming: %s", pchomp(PQerrorMessage(conn->streamConn))))); - } PQclear(res); return true; } @@ -718,26 +709,17 @@ libpqrcv_readtimelinehistoryfile(WalReceiverConn *conn, cmd, WAIT_EVENT_LIBPQWALRECEIVER_RECEIVE); if (PQresultStatus(res) != PGRES_TUPLES_OK) - { - PQclear(res); ereport(ERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("could not receive timeline history file from " "the primary server: %s", pchomp(PQerrorMessage(conn->streamConn))))); - } if (PQnfields(res) != 2 || PQntuples(res) != 1) - { - int ntuples = PQntuples(res); - int nfields = PQnfields(res); - - PQclear(res); ereport(ERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("invalid response from primary server"), errdetail("Expected 1 tuple with 2 fields, got %d tuples with %d fields.", - ntuples, nfields))); - } + PQntuples(res), PQnfields(res)))); *filename = pstrdup(PQgetvalue(res, 0, 0)); *len = PQgetlength(res, 0, 1); @@ -841,13 +823,10 @@ libpqrcv_receive(WalReceiverConn *conn, char **buffer, return -1; } else - { - PQclear(res); ereport(ERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("could not receive data from WAL stream: %s", pchomp(PQerrorMessage(conn->streamConn))))); - } } if (rawlen < -1) ereport(ERROR, @@ -971,13 +950,10 @@ libpqrcv_create_slot(WalReceiverConn *conn, const char *slotname, pfree(cmd.data); if (PQresultStatus(res) != PGRES_TUPLES_OK) - { - PQclear(res); ereport(ERROR, 
(errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("could not create replication slot \"%s\": %s", slotname, pchomp(PQerrorMessage(conn->streamConn))))); - } if (lsn) *lsn = DatumGetLSN(DirectFunctionCall1Coll(pg_lsn_in, InvalidOid, diff --git a/src/backend/replication/logical/applyparallelworker.c b/src/backend/replication/logical/applyparallelworker.c index d25085d3515..1fa931a7422 100644 --- a/src/backend/replication/logical/applyparallelworker.c +++ b/src/backend/replication/logical/applyparallelworker.c @@ -441,7 +441,8 @@ pa_launch_parallel_worker(void) MySubscription->name, MyLogicalRepWorker->userid, InvalidOid, - dsm_segment_handle(winfo->dsm_seg)); + dsm_segment_handle(winfo->dsm_seg), + false); if (launched) { diff --git a/src/backend/replication/logical/launcher.c b/src/backend/replication/logical/launcher.c index 4aed0dfcebb..37377f7eb63 100644 --- a/src/backend/replication/logical/launcher.c +++ b/src/backend/replication/logical/launcher.c @@ -32,6 +32,7 @@ #include "postmaster/interrupt.h" #include "replication/logicallauncher.h" #include "replication/origin.h" +#include "replication/slot.h" #include "replication/walreceiver.h" #include "replication/worker_internal.h" #include "storage/ipc.h" @@ -91,7 +92,6 @@ static dshash_table *last_start_times = NULL; static bool on_commit_launcher_wakeup = false; -static void ApplyLauncherWakeup(void); static void logicalrep_launcher_onexit(int code, Datum arg); static void logicalrep_worker_onexit(int code, Datum arg); static void logicalrep_worker_detach(void); @@ -100,6 +100,9 @@ static int logicalrep_pa_worker_count(Oid subid); static void logicalrep_launcher_attach_dshmem(void); static void ApplyLauncherSetWorkerStartTime(Oid subid, TimestampTz start_time); static TimestampTz ApplyLauncherGetWorkerStartTime(Oid subid); +static void compute_min_nonremovable_xid(LogicalRepWorker *worker, TransactionId *xmin); +static bool acquire_conflict_slot_if_exists(void); +static void advance_conflict_slot_xmin(TransactionId new_xmin); /* @@ -148,6 +151,7 @@ get_subscription_list(void) sub->owner = subform->subowner; sub->enabled = subform->subenabled; sub->name = pstrdup(NameStr(subform->subname)); + sub->retaindeadtuples = subform->subretaindeadtuples; /* We don't fill fields we are not interested in. */ res = lappend(res, sub); @@ -309,7 +313,8 @@ logicalrep_workers_find(Oid subid, bool only_running, bool acquire_lock) bool logicalrep_worker_launch(LogicalRepWorkerType wtype, Oid dbid, Oid subid, const char *subname, Oid userid, - Oid relid, dsm_handle subworker_dsm) + Oid relid, dsm_handle subworker_dsm, + bool retain_dead_tuples) { BackgroundWorker bgw; BackgroundWorkerHandle *bgw_handle; @@ -328,10 +333,13 @@ logicalrep_worker_launch(LogicalRepWorkerType wtype, * - must be valid worker type * - tablesync workers are only ones to have relid * - parallel apply worker is the only kind of subworker + * - The replication slot used in conflict detection is created when + * retain_dead_tuples is enabled */ Assert(wtype != WORKERTYPE_UNKNOWN); Assert(is_tablesync_worker == OidIsValid(relid)); Assert(is_parallel_apply_worker == (subworker_dsm != DSM_HANDLE_INVALID)); + Assert(!retain_dead_tuples || MyReplicationSlot); ereport(DEBUG1, (errmsg_internal("starting logical replication worker for subscription \"%s\"", @@ -454,6 +462,9 @@ retry: worker->stream_fileset = NULL; worker->leader_pid = is_parallel_apply_worker ? MyProcPid : InvalidPid; worker->parallel_apply = is_parallel_apply_worker; + worker->oldest_nonremovable_xid = retain_dead_tuples + ? 
MyReplicationSlot->data.xmin + : InvalidTransactionId; worker->last_lsn = InvalidXLogRecPtr; TIMESTAMP_NOBEGIN(worker->last_send_time); TIMESTAMP_NOBEGIN(worker->last_recv_time); @@ -779,6 +790,8 @@ logicalrep_worker_detach(void) } LWLockRelease(LogicalRepWorkerLock); + + list_free(workers); } /* Block concurrent access. */ @@ -1118,7 +1131,10 @@ ApplyLauncherWakeupAtCommit(void) on_commit_launcher_wakeup = true; } -static void +/* + * Wakeup the launcher immediately. + */ +void ApplyLauncherWakeup(void) { if (LogicalRepCtx->launcher_pid != 0) @@ -1150,6 +1166,12 @@ ApplyLauncherMain(Datum main_arg) */ BackgroundWorkerInitializeConnection(NULL, NULL, 0); + /* + * Acquire the conflict detection slot at startup to ensure it can be + * dropped if no longer needed after a restart. + */ + acquire_conflict_slot_if_exists(); + /* Enter main loop */ for (;;) { @@ -1159,6 +1181,9 @@ ApplyLauncherMain(Datum main_arg) MemoryContext subctx; MemoryContext oldctx; long wait_time = DEFAULT_NAPTIME_PER_CYCLE; + bool can_advance_xmin = true; + bool retain_dead_tuples = false; + TransactionId xmin = InvalidTransactionId; CHECK_FOR_INTERRUPTS(); @@ -1168,7 +1193,14 @@ ApplyLauncherMain(Datum main_arg) ALLOCSET_DEFAULT_SIZES); oldctx = MemoryContextSwitchTo(subctx); - /* Start any missing workers for enabled subscriptions. */ + /* + * Start any missing workers for enabled subscriptions. + * + * Also, during the iteration through all subscriptions, we compute + * the minimum XID required to protect deleted tuples for conflict + * detection if one of the subscription enables retain_dead_tuples + * option. + */ sublist = get_subscription_list(); foreach(lc, sublist) { @@ -1178,6 +1210,38 @@ ApplyLauncherMain(Datum main_arg) TimestampTz now; long elapsed; + if (sub->retaindeadtuples) + { + retain_dead_tuples = true; + + /* + * Can't advance xmin of the slot unless all the subscriptions + * with retain_dead_tuples are enabled. This is required to + * ensure that we don't advance the xmin of + * CONFLICT_DETECTION_SLOT if one of the subscriptions is not + * enabled. Otherwise, we won't be able to detect conflicts + * reliably for such a subscription even though it has set the + * retain_dead_tuples option. + */ + can_advance_xmin &= sub->enabled; + + /* + * Create a replication slot to retain information necessary + * for conflict detection such as dead tuples, commit + * timestamps, and origins. + * + * The slot is created before starting the apply worker to + * prevent it from unnecessarily maintaining its + * oldest_nonremovable_xid. + * + * The slot is created even for a disabled subscription to + * ensure that conflict-related information is available when + * applying remote changes that occurred before the + * subscription was enabled. + */ + CreateConflictDetectionSlot(); + } + if (!sub->enabled) continue; @@ -1186,7 +1250,27 @@ ApplyLauncherMain(Datum main_arg) LWLockRelease(LogicalRepWorkerLock); if (w != NULL) - continue; /* worker is running already */ + { + /* + * Compute the minimum xmin required to protect dead tuples + * required for conflict detection among all running apply + * workers that enables retain_dead_tuples. + */ + if (sub->retaindeadtuples && can_advance_xmin) + compute_min_nonremovable_xid(w, &xmin); + + /* worker is running already */ + continue; + } + + /* + * Can't advance xmin of the slot unless all the workers + * corresponding to subscriptions with retain_dead_tuples are + * running, disabling the further computation of the minimum + * nonremovable xid. 
+ */ + if (sub->retaindeadtuples) + can_advance_xmin = false; /* * If the worker is eligible to start now, launch it. Otherwise, @@ -1210,7 +1294,8 @@ ApplyLauncherMain(Datum main_arg) if (!logicalrep_worker_launch(WORKERTYPE_APPLY, sub->dbid, sub->oid, sub->name, sub->owner, InvalidOid, - DSM_HANDLE_INVALID)) + DSM_HANDLE_INVALID, + sub->retaindeadtuples)) { /* * We get here either if we failed to launch a worker @@ -1230,6 +1315,20 @@ ApplyLauncherMain(Datum main_arg) } } + /* + * Drop the CONFLICT_DETECTION_SLOT slot if there is no subscription + * that requires us to retain dead tuples. Otherwise, if required, + * advance the slot's xmin to protect dead tuples required for the + * conflict detection. + */ + if (MyReplicationSlot) + { + if (!retain_dead_tuples) + ReplicationSlotDropAcquired(); + else if (can_advance_xmin) + advance_conflict_slot_xmin(xmin); + } + /* Switch back to original memory context. */ MemoryContextSwitchTo(oldctx); /* Clean the temporary memory. */ @@ -1258,6 +1357,125 @@ ApplyLauncherMain(Datum main_arg) } /* + * Determine the minimum non-removable transaction ID across all apply workers + * for subscriptions that have retain_dead_tuples enabled. Store the result + * in *xmin. + */ +static void +compute_min_nonremovable_xid(LogicalRepWorker *worker, TransactionId *xmin) +{ + TransactionId nonremovable_xid; + + Assert(worker != NULL); + + /* + * The replication slot for conflict detection must be created before the + * worker starts. + */ + Assert(MyReplicationSlot); + + SpinLockAcquire(&worker->relmutex); + nonremovable_xid = worker->oldest_nonremovable_xid; + SpinLockRelease(&worker->relmutex); + + Assert(TransactionIdIsValid(nonremovable_xid)); + + if (!TransactionIdIsValid(*xmin) || + TransactionIdPrecedes(nonremovable_xid, *xmin)) + *xmin = nonremovable_xid; +} + +/* + * Acquire the replication slot used to retain information for conflict + * detection, if it exists. + * + * Return true if successfully acquired, otherwise return false. + */ +static bool +acquire_conflict_slot_if_exists(void) +{ + if (!SearchNamedReplicationSlot(CONFLICT_DETECTION_SLOT, true)) + return false; + + ReplicationSlotAcquire(CONFLICT_DETECTION_SLOT, true, false); + return true; +} + +/* + * Advance the xmin the replication slot used to retain information required + * for conflict detection. + */ +static void +advance_conflict_slot_xmin(TransactionId new_xmin) +{ + Assert(MyReplicationSlot); + Assert(TransactionIdIsValid(new_xmin)); + Assert(TransactionIdPrecedesOrEquals(MyReplicationSlot->data.xmin, new_xmin)); + + /* Return if the xmin value of the slot cannot be advanced */ + if (TransactionIdEquals(MyReplicationSlot->data.xmin, new_xmin)) + return; + + SpinLockAcquire(&MyReplicationSlot->mutex); + MyReplicationSlot->effective_xmin = new_xmin; + MyReplicationSlot->data.xmin = new_xmin; + SpinLockRelease(&MyReplicationSlot->mutex); + + elog(DEBUG1, "updated xmin: %u", MyReplicationSlot->data.xmin); + + ReplicationSlotMarkDirty(); + ReplicationSlotsComputeRequiredXmin(false); + + /* + * Like PhysicalConfirmReceivedLocation(), do not save slot information + * each time. This is acceptable because all concurrent transactions on + * the publisher that require the data preceding the slot's xmin should + * have already been applied and flushed on the subscriber before the xmin + * is advanced. So, even if the slot's xmin regresses after a restart, it + * will be advanced again in the next cycle. Therefore, no data required + * for conflict detection will be prematurely removed. 
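In the launcher loop above, the xmin candidate starts out invalid and each running apply worker's oldest_nonremovable_xid is folded in, keeping the smallest value seen. A self-contained sketch of that accumulation (plain integer comparison and an invented INVALID_XID sentinel stand in for TransactionIdPrecedes() and InvalidTransactionId, which handle wraparound in the real code):

#include <stdint.h>
#include <stdio.h>

#define INVALID_XID 0u

int main(void)
{
    uint32_t worker_xids[] = {742, 598, 903};   /* per-worker values, made up */
    uint32_t xmin = INVALID_XID;

    for (int i = 0; i < 3; i++)
    {
        uint32_t x = worker_xids[i];

        if (xmin == INVALID_XID || x < xmin)
            xmin = x;                           /* keep the oldest */
    }

    if (xmin != INVALID_XID)
        printf("advance the slot's xmin to %u\n", xmin);
    return 0;
}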
+ */ + return; +} + +/* + * Create and acquire the replication slot used to retain information for + * conflict detection, if not yet. + */ +void +CreateConflictDetectionSlot(void) +{ + TransactionId xmin_horizon; + + /* Exit early, if the replication slot is already created and acquired */ + if (MyReplicationSlot) + return; + + ereport(LOG, + errmsg("creating replication conflict detection slot")); + + ReplicationSlotCreate(CONFLICT_DETECTION_SLOT, false, RS_PERSISTENT, false, + false, false); + + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + + xmin_horizon = GetOldestSafeDecodingTransactionId(false); + + SpinLockAcquire(&MyReplicationSlot->mutex); + MyReplicationSlot->effective_xmin = xmin_horizon; + MyReplicationSlot->data.xmin = xmin_horizon; + SpinLockRelease(&MyReplicationSlot->mutex); + + ReplicationSlotsComputeRequiredXmin(true); + + LWLockRelease(ProcArrayLock); + + /* Write this slot to disk */ + ReplicationSlotMarkDirty(); + ReplicationSlotSave(); +} + +/* * Is current process the logical replication launcher? */ bool diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c index 7b4e8629553..34cf05668ae 100644 --- a/src/backend/replication/logical/reorderbuffer.c +++ b/src/backend/replication/logical/reorderbuffer.c @@ -2599,7 +2599,7 @@ ReorderBufferProcessTXN(ReorderBuffer *rb, ReorderBufferTXN *txn, if (++changes_count >= CHANGES_THRESHOLD) { - rb->update_progress_txn(rb, txn, change->lsn); + rb->update_progress_txn(rb, txn, prev_lsn); changes_count = 0; } } @@ -4917,7 +4917,7 @@ StartupReorderBuffer(void) continue; /* if it cannot be a slot, skip the directory */ - if (!ReplicationSlotValidateName(logical_de->d_name, DEBUG2)) + if (!ReplicationSlotValidateName(logical_de->d_name, true, DEBUG2)) continue; /* diff --git a/src/backend/replication/logical/tablesync.c b/src/backend/replication/logical/tablesync.c index e4fd6347fd1..d3356bc84ee 100644 --- a/src/backend/replication/logical/tablesync.c +++ b/src/backend/replication/logical/tablesync.c @@ -316,7 +316,8 @@ process_syncing_tables_for_sync(XLogRecPtr current_lsn) UpdateSubscriptionRelState(MyLogicalRepWorker->subid, MyLogicalRepWorker->relid, MyLogicalRepWorker->relstate, - MyLogicalRepWorker->relstate_lsn); + MyLogicalRepWorker->relstate_lsn, + false); /* * End streaming so that LogRepWorkerWalRcvConn can be used to drop @@ -425,6 +426,7 @@ process_syncing_tables_for_apply(XLogRecPtr current_lsn) ListCell *lc; bool started_tx = false; bool should_exit = false; + Relation rel = NULL; Assert(!IsTransactionState()); @@ -492,7 +494,17 @@ process_syncing_tables_for_apply(XLogRecPtr current_lsn) * worker to remove the origin tracking as if there is any * error while dropping we won't restart it to drop the * origin. So passing missing_ok = true. + * + * Lock the subscription and origin in the same order as we + * are doing during DDL commands to avoid deadlocks. See + * AlterSubscription_refresh. 
*/ + LockSharedObject(SubscriptionRelationId, MyLogicalRepWorker->subid, + 0, AccessShareLock); + + if (!rel) + rel = table_open(SubscriptionRelRelationId, RowExclusiveLock); + ReplicationOriginNameForLogicalRep(MyLogicalRepWorker->subid, rstate->relid, originname, @@ -504,7 +516,7 @@ process_syncing_tables_for_apply(XLogRecPtr current_lsn) */ UpdateSubscriptionRelState(MyLogicalRepWorker->subid, rstate->relid, rstate->state, - rstate->lsn); + rstate->lsn, true); } } else @@ -555,7 +567,14 @@ process_syncing_tables_for_apply(XLogRecPtr current_lsn) * This is required to avoid any undetected deadlocks * due to any existing lock as deadlock detector won't * be able to detect the waits on the latch. + * + * Also close any tables prior to the commit. */ + if (rel) + { + table_close(rel, NoLock); + rel = NULL; + } CommitTransactionCommand(); pgstat_report_stat(false); } @@ -615,13 +634,19 @@ process_syncing_tables_for_apply(XLogRecPtr current_lsn) MySubscription->name, MyLogicalRepWorker->userid, rstate->relid, - DSM_HANDLE_INVALID); + DSM_HANDLE_INVALID, + false); } } } } } + /* Close table if opened */ + if (rel) + table_close(rel, NoLock); + + if (started_tx) { /* @@ -1413,7 +1438,8 @@ LogicalRepSyncTableStart(XLogRecPtr *origin_startpos) UpdateSubscriptionRelState(MyLogicalRepWorker->subid, MyLogicalRepWorker->relid, MyLogicalRepWorker->relstate, - MyLogicalRepWorker->relstate_lsn); + MyLogicalRepWorker->relstate_lsn, + false); CommitTransactionCommand(); pgstat_report_stat(true); @@ -1546,7 +1572,8 @@ LogicalRepSyncTableStart(XLogRecPtr *origin_startpos) UpdateSubscriptionRelState(MyLogicalRepWorker->subid, MyLogicalRepWorker->relid, SUBREL_STATE_FINISHEDCOPY, - MyLogicalRepWorker->relstate_lsn); + MyLogicalRepWorker->relstate_lsn, + false); CommitTransactionCommand(); diff --git a/src/backend/replication/logical/worker.c b/src/backend/replication/logical/worker.c index c5fb627aa56..b59221c4d06 100644 --- a/src/backend/replication/logical/worker.c +++ b/src/backend/replication/logical/worker.c @@ -132,6 +132,96 @@ * failover = true when creating the subscription. Enabling failover allows us * to smoothly transition to the promoted standby, ensuring that we can * subscribe to the new primary without losing any data. + * + * RETAIN DEAD TUPLES + * ---------------------- + * Each apply worker that enabled retain_dead_tuples option maintains a + * non-removable transaction ID (oldest_nonremovable_xid) in shared memory to + * prevent dead rows from being removed prematurely when the apply worker still + * needs them to detect conflicts reliably. This helps to retain the required + * commit_ts module information, which further helps to detect + * update_origin_differs and delete_origin_differs conflicts reliably, as + * otherwise, vacuum freeze could remove the required information. + * + * The logical replication launcher manages an internal replication slot named + * "pg_conflict_detection". It asynchronously aggregates the non-removable + * transaction ID from all apply workers to determine the appropriate xmin for + * the slot, thereby retaining necessary tuples. + * + * The non-removable transaction ID in the apply worker is advanced to the + * oldest running transaction ID once all concurrent transactions on the + * publisher have been applied and flushed locally. The process involves: + * + * - RDT_GET_CANDIDATE_XID: + * Call GetOldestActiveTransactionId() to take oldestRunningXid as the + * candidate xid. 
+ * + * - RDT_REQUEST_PUBLISHER_STATUS: + * Send a message to the walsender requesting the publisher status, which + * includes the latest WAL write position and information about transactions + * that are in the commit phase. + * + * - RDT_WAIT_FOR_PUBLISHER_STATUS: + * Wait for the status from the walsender. After receiving the first status, + * do not proceed if there are concurrent remote transactions that are still + * in the commit phase. These transactions might have been assigned an + * earlier commit timestamp but have not yet written the commit WAL record. + * Continue to request the publisher status (RDT_REQUEST_PUBLISHER_STATUS) + * until all these transactions have completed. + * + * - RDT_WAIT_FOR_LOCAL_FLUSH: + * Advance the non-removable transaction ID if the current flush location has + * reached or surpassed the last received WAL position. + * + * The overall state progression is: GET_CANDIDATE_XID -> + * REQUEST_PUBLISHER_STATUS -> WAIT_FOR_PUBLISHER_STATUS -> (loop to + * REQUEST_PUBLISHER_STATUS till concurrent remote transactions end) -> + * WAIT_FOR_LOCAL_FLUSH -> loop back to GET_CANDIDATE_XID. + * + * Retaining the dead tuples for this period is sufficient for ensuring + * eventual consistency using last-update-wins strategy, as dead tuples are + * useful for detecting conflicts only during the application of concurrent + * transactions from remote nodes. After applying and flushing all remote + * transactions that occurred concurrently with the tuple DELETE, any + * subsequent UPDATE from a remote node should have a later timestamp. In such + * cases, it is acceptable to detect an update_missing scenario and convert the + * UPDATE to an INSERT when applying it. But, detecting concurrent remote + * transactions with earlier timestamps than the DELETE is necessary, as the + * UPDATEs in remote transactions should be ignored if their timestamp is + * earlier than that of the dead tuples. + * + * Note that advancing the non-removable transaction ID is not supported if the + * publisher is also a physical standby. This is because the logical walsender + * on the standby can only get the WAL replay position but there may be more + * WALs that are being replicated from the primary and those WALs could have + * earlier commit timestamp. + * + * Similarly, when the publisher has subscribed to another publisher, + * information necessary for conflict detection cannot be retained for + * changes from origins other than the publisher. This is because publisher + * lacks the information on concurrent transactions of other publishers to + * which it subscribes. As the information on concurrent transactions is + * unavailable beyond subscriber's immediate publishers, the non-removable + * transaction ID might be advanced prematurely before changes from other + * origins have been fully applied. + * + * XXX Retaining information for changes from other origins might be possible + * by requesting the subscription on that origin to enable retain_dead_tuples + * and fetching the conflict detection slot.xmin along with the publisher's + * status. In the RDT_WAIT_FOR_PUBLISHER_STATUS phase, the apply worker could + * wait for the remote slot's xmin to reach the oldest active transaction ID, + * ensuring that all transactions from other origins have been applied on the + * publisher, thereby getting the latest WAL position that includes all + * concurrent changes. However, this approach may impact performance, so it + * might not worth the effort. 
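The state progression described above can be pictured as a small driver loop over the four phases, cycling between the request and wait phases until the publisher reports that the older in-commit transactions have finished. A compact, self-contained sketch of that control flow (the phase names are abbreviated and the publisher's progress is simulated with a counter; this is only an illustration, not the worker code):

#include <stdio.h>

typedef enum { GET_XID, REQUEST_STATUS, WAIT_STATUS, WAIT_FLUSH, DONE } Phase;

int main(void)
{
    Phase   phase = GET_XID;
    int     remote_oldestxid = 5;   /* some remote xacts still committing */
    int     wait_for = 8;           /* remote nextxid captured at first status */

    while (phase != DONE)
    {
        switch (phase)
        {
            case GET_XID:
                printf("picked a candidate xid\n");
                phase = REQUEST_STATUS;
                break;
            case REQUEST_STATUS:
                printf("requested publisher status\n");
                phase = WAIT_STATUS;
                break;
            case WAIT_STATUS:
                remote_oldestxid += 2;      /* remote xacts finish over time */
                phase = remote_oldestxid >= wait_for ?
                    WAIT_FLUSH : REQUEST_STATUS;
                break;
            case WAIT_FLUSH:
                printf("local flush caught up; advancing the xid\n");
                phase = DONE;
                break;
            case DONE:
                break;
        }
    }
    return 0;
}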
+ * + * XXX It seems feasible to get the latest commit's WAL location from the + * publisher and wait till that is applied. However, we can't do that + * because commit timestamps can regress as a commit with a later LSN is not + * guaranteed to have a later timestamp than those with earlier LSNs. Having + * said that, even if that is possible, it won't improve performance much as + * the apply always lag and moves slowly as compared with the transactions + * on the publisher. *------------------------------------------------------------------------- */ @@ -140,6 +230,7 @@ #include <sys/stat.h> #include <unistd.h> +#include "access/commit_ts.h" #include "access/table.h" #include "access/tableam.h" #include "access/twophase.h" @@ -148,6 +239,7 @@ #include "catalog/pg_inherits.h" #include "catalog/pg_subscription.h" #include "catalog/pg_subscription_rel.h" +#include "commands/subscriptioncmds.h" #include "commands/tablecmds.h" #include "commands/trigger.h" #include "executor/executor.h" @@ -166,12 +258,14 @@ #include "replication/logicalrelation.h" #include "replication/logicalworker.h" #include "replication/origin.h" +#include "replication/slot.h" #include "replication/walreceiver.h" #include "replication/worker_internal.h" #include "rewrite/rewriteHandler.h" #include "storage/buffile.h" #include "storage/ipc.h" #include "storage/lmgr.h" +#include "storage/procarray.h" #include "tcop/tcopprot.h" #include "utils/acl.h" #include "utils/dynahash.h" @@ -268,6 +362,78 @@ typedef enum TRANS_PARALLEL_APPLY, } TransApplyAction; +/* + * The phases involved in advancing the non-removable transaction ID. + * + * See comments atop worker.c for details of the transition between these + * phases. + */ +typedef enum +{ + RDT_GET_CANDIDATE_XID, + RDT_REQUEST_PUBLISHER_STATUS, + RDT_WAIT_FOR_PUBLISHER_STATUS, + RDT_WAIT_FOR_LOCAL_FLUSH +} RetainDeadTuplesPhase; + +/* + * Critical information for managing phase transitions within the + * RetainDeadTuplesPhase. + */ +typedef struct RetainDeadTuplesData +{ + RetainDeadTuplesPhase phase; /* current phase */ + XLogRecPtr remote_lsn; /* WAL write position on the publisher */ + + /* + * Oldest transaction ID that was in the commit phase on the publisher. + * Use FullTransactionId to prevent issues with transaction ID wraparound, + * where a new remote_oldestxid could falsely appear to originate from the + * past and block advancement. + */ + FullTransactionId remote_oldestxid; + + /* + * Next transaction ID to be assigned on the publisher. Use + * FullTransactionId for consistency and to allow straightforward + * comparisons with remote_oldestxid. + */ + FullTransactionId remote_nextxid; + + TimestampTz reply_time; /* when the publisher responds with status */ + + /* + * Publisher transaction ID that must be awaited to complete before + * entering the final phase (RDT_WAIT_FOR_LOCAL_FLUSH). Use + * FullTransactionId for the same reason as remote_nextxid. + */ + FullTransactionId remote_wait_for; + + TransactionId candidate_xid; /* candidate for the non-removable + * transaction ID */ + TimestampTz flushpos_update_time; /* when the remote flush position was + * updated in final phase + * (RDT_WAIT_FOR_LOCAL_FLUSH) */ + + /* + * The following fields are used to determine the timing for the next + * round of transaction ID advancement. 
+ */ + TimestampTz last_recv_time; /* when the last message was received */ + TimestampTz candidate_xid_time; /* when the candidate_xid is decided */ + int xid_advance_interval; /* how much time (ms) to wait before + * attempting to advance the + * non-removable transaction ID */ +} RetainDeadTuplesData; + +/* + * The minimum (100ms) and maximum (3 minutes) intervals for advancing + * non-removable transaction IDs. The maximum interval is a bit arbitrary but + * is sufficient to not cause any undue network traffic. + */ +#define MIN_XID_ADVANCE_INTERVAL 100 +#define MAX_XID_ADVANCE_INTERVAL 180000 + /* errcontext tracker */ static ApplyErrorCallbackArg apply_error_callback_arg = { @@ -332,6 +498,13 @@ static XLogRecPtr skip_xact_finish_lsn = InvalidXLogRecPtr; /* BufFile handle of the current streaming file */ static BufFile *stream_fd = NULL; +/* + * The remote WAL position that has been applied and flushed locally. We record + * and use this information both while sending feedback to the server and + * advancing oldest_nonremovable_xid. + */ +static XLogRecPtr last_flushpos = InvalidXLogRecPtr; + typedef struct SubXactInfo { TransactionId xid; /* XID of the subxact */ @@ -372,6 +545,19 @@ static void stream_close_file(void); static void send_feedback(XLogRecPtr recvpos, bool force, bool requestReply); +static void maybe_advance_nonremovable_xid(RetainDeadTuplesData *rdt_data, + bool status_received); +static bool can_advance_nonremovable_xid(RetainDeadTuplesData *rdt_data); +static void process_rdt_phase_transition(RetainDeadTuplesData *rdt_data, + bool status_received); +static void get_candidate_xid(RetainDeadTuplesData *rdt_data); +static void request_publisher_status(RetainDeadTuplesData *rdt_data); +static void wait_for_publisher_status(RetainDeadTuplesData *rdt_data, + bool status_received); +static void wait_for_local_flush(RetainDeadTuplesData *rdt_data); +static void adjust_xid_advance_interval(RetainDeadTuplesData *rdt_data, + bool new_xid_found); + static void apply_handle_commit_internal(LogicalRepCommitData *commit_data); static void apply_handle_insert_internal(ApplyExecutionData *edata, ResultRelInfo *relinfo, @@ -3577,6 +3763,7 @@ LogicalRepApplyLoop(XLogRecPtr last_received) bool ping_sent = false; TimeLineID tli; ErrorContextCallback errcallback; + RetainDeadTuplesData rdt_data = {0}; /* * Init the ApplyMessageContext which we clean up after each replication @@ -3655,6 +3842,8 @@ LogicalRepApplyLoop(XLogRecPtr last_received) last_recv_timestamp = GetCurrentTimestamp(); ping_sent = false; + rdt_data.last_recv_time = last_recv_timestamp; + /* Ensure we are reading the data into our memory context. 
*/ MemoryContextSwitchTo(ApplyMessageContext); @@ -3681,6 +3870,8 @@ LogicalRepApplyLoop(XLogRecPtr last_received) UpdateWorkerStats(last_received, send_time, false); apply_dispatch(&s); + + maybe_advance_nonremovable_xid(&rdt_data, false); } else if (c == 'k') { @@ -3696,8 +3887,31 @@ LogicalRepApplyLoop(XLogRecPtr last_received) last_received = end_lsn; send_feedback(last_received, reply_requested, false); + + maybe_advance_nonremovable_xid(&rdt_data, false); + UpdateWorkerStats(last_received, timestamp, true); } + else if (c == 's') /* Primary status update */ + { + rdt_data.remote_lsn = pq_getmsgint64(&s); + rdt_data.remote_oldestxid = FullTransactionIdFromU64((uint64) pq_getmsgint64(&s)); + rdt_data.remote_nextxid = FullTransactionIdFromU64((uint64) pq_getmsgint64(&s)); + rdt_data.reply_time = pq_getmsgint64(&s); + + /* + * This should never happen, see + * ProcessStandbyPSRequestMessage. But if it happens + * due to a bug, we don't want to proceed as it can + * incorrectly advance oldest_nonremovable_xid. + */ + if (XLogRecPtrIsInvalid(rdt_data.remote_lsn)) + elog(ERROR, "cannot get the latest WAL position from the publisher"); + + maybe_advance_nonremovable_xid(&rdt_data, true); + + UpdateWorkerStats(last_received, rdt_data.reply_time, false); + } /* other message types are purposefully ignored */ MemoryContextReset(ApplyMessageContext); @@ -3710,6 +3924,11 @@ LogicalRepApplyLoop(XLogRecPtr last_received) /* confirm all writes so far */ send_feedback(last_received, false, false); + /* Reset the timestamp if no message was received */ + rdt_data.last_recv_time = 0; + + maybe_advance_nonremovable_xid(&rdt_data, false); + if (!in_remote_transaction && !in_streamed_transaction) { /* @@ -3744,6 +3963,14 @@ LogicalRepApplyLoop(XLogRecPtr last_received) else wait_time = NAPTIME_PER_CYCLE; + /* + * Ensure to wake up when it's possible to advance the non-removable + * transaction ID. + */ + if (rdt_data.phase == RDT_GET_CANDIDATE_XID && + rdt_data.xid_advance_interval) + wait_time = Min(wait_time, rdt_data.xid_advance_interval); + rc = WaitLatchOrSocket(MyLatch, WL_SOCKET_READABLE | WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, @@ -3807,6 +4034,8 @@ LogicalRepApplyLoop(XLogRecPtr last_received) send_feedback(last_received, requestReply, requestReply); + maybe_advance_nonremovable_xid(&rdt_data, false); + /* * Force reporting to ensure long idle periods don't lead to * arbitrarily delayed stats. Stats can only be reported outside @@ -3842,7 +4071,6 @@ send_feedback(XLogRecPtr recvpos, bool force, bool requestReply) static XLogRecPtr last_recvpos = InvalidXLogRecPtr; static XLogRecPtr last_writepos = InvalidXLogRecPtr; - static XLogRecPtr last_flushpos = InvalidXLogRecPtr; XLogRecPtr writepos; XLogRecPtr flushpos; @@ -3921,6 +4149,367 @@ send_feedback(XLogRecPtr recvpos, bool force, bool requestReply) } /* + * Attempt to advance the non-removable transaction ID. + * + * See comments atop worker.c for details. + */ +static void +maybe_advance_nonremovable_xid(RetainDeadTuplesData *rdt_data, + bool status_received) +{ + if (!can_advance_nonremovable_xid(rdt_data)) + return; + + process_rdt_phase_transition(rdt_data, status_received); +} + +/* + * Preliminary check to determine if advancing the non-removable transaction ID + * is allowed. 
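The 's' (primary status) message handled above is a CopyData payload consisting of a tag byte followed by four 64-bit big-endian integers: the publisher's WAL write position, the oldest full transaction ID still in the commit phase, the next full transaction ID, and a timestamp. A standalone sketch of packing and unpacking that layout (plain byte shuffling here; the server uses pq_sendint64()/pq_getmsgint64(), and the Status struct is invented):

#include <stdint.h>
#include <stdio.h>

/* invented container for the four status fields */
typedef struct { uint64_t lsn, oldest_xid, next_xid, reply_time; } Status;

static void put_u64(unsigned char *p, uint64_t v)
{
    for (int i = 0; i < 8; i++)
        p[i] = (unsigned char) (v >> (56 - 8 * i));     /* big-endian */
}

static uint64_t get_u64(const unsigned char *p)
{
    uint64_t v = 0;

    for (int i = 0; i < 8; i++)
        v = (v << 8) | p[i];
    return v;
}

int main(void)
{
    Status          s = {0x12345678, 800, 812, 1234567890};
    unsigned char   msg[1 + 4 * 8];

    msg[0] = 's';
    put_u64(msg + 1, s.lsn);
    put_u64(msg + 9, s.oldest_xid);
    put_u64(msg + 17, s.next_xid);
    put_u64(msg + 25, s.reply_time);

    /* receiver side */
    if (msg[0] == 's')
        printf("lsn=%llu oldest=%llu next=%llu\n",
               (unsigned long long) get_u64(msg + 1),
               (unsigned long long) get_u64(msg + 9),
               (unsigned long long) get_u64(msg + 17));
    return 0;
}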
+ */ +static bool +can_advance_nonremovable_xid(RetainDeadTuplesData *rdt_data) +{ + /* + * It is sufficient to manage non-removable transaction ID for a + * subscription by the main apply worker to detect conflicts reliably even + * for table sync or parallel apply workers. + */ + if (!am_leader_apply_worker()) + return false; + + /* No need to advance if retaining dead tuples is not required */ + if (!MySubscription->retaindeadtuples) + return false; + + return true; +} + +/* + * Process phase transitions during the non-removable transaction ID + * advancement. See comments atop worker.c for details of the transition. + */ +static void +process_rdt_phase_transition(RetainDeadTuplesData *rdt_data, + bool status_received) +{ + switch (rdt_data->phase) + { + case RDT_GET_CANDIDATE_XID: + get_candidate_xid(rdt_data); + break; + case RDT_REQUEST_PUBLISHER_STATUS: + request_publisher_status(rdt_data); + break; + case RDT_WAIT_FOR_PUBLISHER_STATUS: + wait_for_publisher_status(rdt_data, status_received); + break; + case RDT_WAIT_FOR_LOCAL_FLUSH: + wait_for_local_flush(rdt_data); + break; + } +} + +/* + * Workhorse for the RDT_GET_CANDIDATE_XID phase. + */ +static void +get_candidate_xid(RetainDeadTuplesData *rdt_data) +{ + TransactionId oldest_running_xid; + TimestampTz now; + + /* + * Use last_recv_time when applying changes in the loop to avoid + * unnecessary system time retrieval. If last_recv_time is not available, + * obtain the current timestamp. + */ + now = rdt_data->last_recv_time ? rdt_data->last_recv_time : GetCurrentTimestamp(); + + /* + * Compute the candidate_xid and request the publisher status at most once + * per xid_advance_interval. Refer to adjust_xid_advance_interval() for + * details on how this value is dynamically adjusted. This is to avoid + * using CPU and network resources without making much progress. + */ + if (!TimestampDifferenceExceeds(rdt_data->candidate_xid_time, now, + rdt_data->xid_advance_interval)) + return; + + /* + * Immediately update the timer, even if the function returns later + * without setting candidate_xid due to inactivity on the subscriber. This + * avoids frequent calls to GetOldestActiveTransactionId. + */ + rdt_data->candidate_xid_time = now; + + /* + * Consider transactions in the current database, as only dead tuples from + * this database are required for conflict detection. + */ + oldest_running_xid = GetOldestActiveTransactionId(false, false); + + /* + * Oldest active transaction ID (oldest_running_xid) can't be behind any + * of its previously computed value. + */ + Assert(TransactionIdPrecedesOrEquals(MyLogicalRepWorker->oldest_nonremovable_xid, + oldest_running_xid)); + + /* Return if the oldest_nonremovable_xid cannot be advanced */ + if (TransactionIdEquals(MyLogicalRepWorker->oldest_nonremovable_xid, + oldest_running_xid)) + { + adjust_xid_advance_interval(rdt_data, false); + return; + } + + adjust_xid_advance_interval(rdt_data, true); + + rdt_data->candidate_xid = oldest_running_xid; + rdt_data->phase = RDT_REQUEST_PUBLISHER_STATUS; + + /* process the next phase */ + process_rdt_phase_transition(rdt_data, false); +} + +/* + * Workhorse for the RDT_REQUEST_PUBLISHER_STATUS phase. 
+ */ +static void +request_publisher_status(RetainDeadTuplesData *rdt_data) +{ + static StringInfo request_message = NULL; + + if (!request_message) + { + MemoryContext oldctx = MemoryContextSwitchTo(ApplyContext); + + request_message = makeStringInfo(); + MemoryContextSwitchTo(oldctx); + } + else + resetStringInfo(request_message); + + /* + * Send the current time to update the remote walsender's latest reply + * message received time. + */ + pq_sendbyte(request_message, 'p'); + pq_sendint64(request_message, GetCurrentTimestamp()); + + elog(DEBUG2, "sending publisher status request message"); + + /* Send a request for the publisher status */ + walrcv_send(LogRepWorkerWalRcvConn, + request_message->data, request_message->len); + + rdt_data->phase = RDT_WAIT_FOR_PUBLISHER_STATUS; + + /* + * Skip calling maybe_advance_nonremovable_xid() since further transition + * is possible only once we receive the publisher status message. + */ +} + +/* + * Workhorse for the RDT_WAIT_FOR_PUBLISHER_STATUS phase. + */ +static void +wait_for_publisher_status(RetainDeadTuplesData *rdt_data, + bool status_received) +{ + /* + * Return if we have requested but not yet received the publisher status. + */ + if (!status_received) + return; + + if (!FullTransactionIdIsValid(rdt_data->remote_wait_for)) + rdt_data->remote_wait_for = rdt_data->remote_nextxid; + + /* + * Check if all remote concurrent transactions that were active at the + * first status request have now completed. If completed, proceed to the + * next phase; otherwise, continue checking the publisher status until + * these transactions finish. + * + * It's possible that transactions in the commit phase during the last + * cycle have now finished committing, but remote_oldestxid remains older + * than remote_wait_for. This can happen if some old transaction came in + * the commit phase when we requested status in this cycle. We do not + * handle this case explicitly as it's rare and the benefit doesn't + * justify the required complexity. Tracking would require either caching + * all xids at the publisher or sending them to subscribers. The condition + * will resolve naturally once the remaining transactions are finished. + * + * Directly advancing the non-removable transaction ID is possible if + * there are no activities on the publisher since the last advancement + * cycle. However, it requires maintaining two fields, last_remote_nextxid + * and last_remote_lsn, within the structure for comparison with the + * current cycle's values. Considering the minimal cost of continuing in + * RDT_WAIT_FOR_LOCAL_FLUSH without awaiting changes, we opted not to + * advance the transaction ID here. + */ + if (FullTransactionIdPrecedesOrEquals(rdt_data->remote_wait_for, + rdt_data->remote_oldestxid)) + rdt_data->phase = RDT_WAIT_FOR_LOCAL_FLUSH; + else + rdt_data->phase = RDT_REQUEST_PUBLISHER_STATUS; + + /* process the next phase */ + process_rdt_phase_transition(rdt_data, false); +} + +/* + * Workhorse for the RDT_WAIT_FOR_LOCAL_FLUSH phase. + */ +static void +wait_for_local_flush(RetainDeadTuplesData *rdt_data) +{ + Assert(!XLogRecPtrIsInvalid(rdt_data->remote_lsn) && + TransactionIdIsValid(rdt_data->candidate_xid)); + + /* + * We expect the publisher and subscriber clocks to be in sync using time + * sync service like NTP. Otherwise, we will advance this worker's + * oldest_nonremovable_xid prematurely, leading to the removal of rows + * required to detect conflicts reliably. 
This check primarily addresses + * scenarios where the publisher's clock falls behind; if the publisher's + * clock is ahead, subsequent transactions will naturally bear later + * commit timestamps, conforming to the design outlined atop worker.c. + * + * XXX Consider waiting for the publisher's clock to catch up with the + * subscriber's before proceeding to the next phase. + */ + if (TimestampDifferenceExceeds(rdt_data->reply_time, + rdt_data->candidate_xid_time, 0)) + ereport(ERROR, + errmsg_internal("oldest_nonremovable_xid transaction ID could be advanced prematurely"), + errdetail_internal("The clock on the publisher is behind that of the subscriber.")); + + /* + * Do not attempt to advance the non-removable transaction ID when table + * sync is in progress. During this time, changes from a single + * transaction may be applied by multiple table sync workers corresponding + * to the target tables. So, it's necessary for all table sync workers to + * apply and flush the corresponding changes before advancing the + * transaction ID, otherwise, dead tuples that are still needed for + * conflict detection in table sync workers could be removed prematurely. + * However, confirming the apply and flush progress across all table sync + * workers is complex and not worth the effort, so we simply return if not + * all tables are in the READY state. + * + * It is safe to add new tables with initial states to the subscription + * after this check because any changes applied to these tables should + * have a WAL position greater than the rdt_data->remote_lsn. + */ + if (!AllTablesyncsReady()) + return; + + /* + * Update and check the remote flush position if we are applying changes + * in a loop. This is done at most once per WalWriterDelay to avoid + * performing costly operations in get_flush_position() too frequently + * during change application. + */ + if (last_flushpos < rdt_data->remote_lsn && rdt_data->last_recv_time && + TimestampDifferenceExceeds(rdt_data->flushpos_update_time, + rdt_data->last_recv_time, WalWriterDelay)) + { + XLogRecPtr writepos; + XLogRecPtr flushpos; + bool have_pending_txes; + + /* Fetch the latest remote flush position */ + get_flush_position(&writepos, &flushpos, &have_pending_txes); + + if (flushpos > last_flushpos) + last_flushpos = flushpos; + + rdt_data->flushpos_update_time = rdt_data->last_recv_time; + } + + /* Return to wait for the changes to be applied */ + if (last_flushpos < rdt_data->remote_lsn) + return; + + /* + * Reaching here means the remote WAL position has been received, and all + * transactions up to that position on the publisher have been applied and + * flushed locally. So, we can advance the non-removable transaction ID. + */ + SpinLockAcquire(&MyLogicalRepWorker->relmutex); + MyLogicalRepWorker->oldest_nonremovable_xid = rdt_data->candidate_xid; + SpinLockRelease(&MyLogicalRepWorker->relmutex); + + elog(DEBUG2, "confirmed flush up to remote lsn %X/%X: new oldest_nonremovable_xid %u", + LSN_FORMAT_ARGS(rdt_data->remote_lsn), + rdt_data->candidate_xid); + + /* Notify launcher to update the xmin of the conflict slot */ + ApplyLauncherWakeup(); + + /* + * Reset all data fields except those used to determine the timing for the + * next round of transaction ID advancement. We can even use + * flushpos_update_time in the next round to decide whether to get the + * latest flush position. 
+ */ + rdt_data->phase = RDT_GET_CANDIDATE_XID; + rdt_data->remote_lsn = InvalidXLogRecPtr; + rdt_data->remote_oldestxid = InvalidFullTransactionId; + rdt_data->remote_nextxid = InvalidFullTransactionId; + rdt_data->reply_time = 0; + rdt_data->remote_wait_for = InvalidFullTransactionId; + rdt_data->candidate_xid = InvalidTransactionId; + + /* process the next phase */ + process_rdt_phase_transition(rdt_data, false); +} + +/* + * Adjust the interval for advancing non-removable transaction IDs. + * + * We double the interval to try advancing the non-removable transaction IDs + * if there is no activity on the node. The maximum value of the interval is + * capped by wal_receiver_status_interval if it is not zero, otherwise to a + * 3 minutes which should be sufficient to avoid using CPU or network + * resources without much benefit. + * + * The interval is reset to a minimum value of 100ms once there is some + * activity on the node. + * + * XXX The use of wal_receiver_status_interval is a bit arbitrary so we can + * consider the other interval or a separate GUC if the need arises. + */ +static void +adjust_xid_advance_interval(RetainDeadTuplesData *rdt_data, bool new_xid_found) +{ + if (!new_xid_found && rdt_data->xid_advance_interval) + { + int max_interval = wal_receiver_status_interval + ? wal_receiver_status_interval * 1000 + : MAX_XID_ADVANCE_INTERVAL; + + /* + * No new transaction ID has been assigned since the last check, so + * double the interval, but not beyond the maximum allowable value. + */ + rdt_data->xid_advance_interval = Min(rdt_data->xid_advance_interval * 2, + max_interval); + } + else + { + /* + * A new transaction ID was found or the interval is not yet + * initialized, so set the interval to the minimum value. + */ + rdt_data->xid_advance_interval = MIN_XID_ADVANCE_INTERVAL; + } +} + +/* * Exit routine for apply workers due to subscription parameter changes. */ static void @@ -4708,6 +5297,30 @@ InitializeLogRepWorker(void) apply_worker_exit(); } + /* + * Restart the worker if retain_dead_tuples was enabled during startup. + * + * At this point, the replication slot used for conflict detection might + * not exist yet, or could be dropped soon if the launcher perceives + * retain_dead_tuples as disabled. To avoid unnecessary tracking of + * oldest_nonremovable_xid when the slot is absent or at risk of being + * dropped, a restart is initiated. + * + * The oldest_nonremovable_xid should be initialized only when the + * retain_dead_tuples is enabled before launching the worker. See + * logicalrep_worker_launch. + */ + if (am_leader_apply_worker() && + MySubscription->retaindeadtuples && + !TransactionIdIsValid(MyLogicalRepWorker->oldest_nonremovable_xid)) + { + ereport(LOG, + errmsg("logical replication worker for subscription \"%s\" will restart because the option %s was enabled during startup", + MySubscription->name, "retain_dead_tuples")); + + apply_worker_exit(); + } + /* Setup synchronous commit according to the user's wishes */ SetConfigOption("synchronous_commit", MySubscription->synccommit, PGC_BACKEND, PGC_S_OVERRIDE); @@ -4864,6 +5477,14 @@ DisableSubscriptionAndExit(void) errmsg("subscription \"%s\" has been disabled because of an error", MySubscription->name)); + /* + * Skip the track_commit_timestamp check when disabling the worker due to + * an error, as verifying commit timestamps is unnecessary in this + * context. 
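adjust_xid_advance_interval() above implements a doubling-with-cap backoff: while no new transaction ID shows up the wait interval doubles, bounded by wal_receiver_status_interval (or 3 minutes when that is zero), and it snaps back to 100ms as soon as there is activity. A self-contained sketch of the same backoff shape (the helper name and fixed cap are illustrative):

#include <stdio.h>

#define MIN_INTERVAL_MS   100
#define MAX_INTERVAL_MS   180000

/* Doubling-with-cap backoff: reset on activity, double while idle. */
static int next_interval(int current, int activity_seen, int cap)
{
    if (activity_seen || current == 0)
        return MIN_INTERVAL_MS;
    return current * 2 > cap ? cap : current * 2;
}

int main(void)
{
    int interval = 0;

    for (int round = 0; round < 6; round++)
    {
        interval = next_interval(interval, 0, MAX_INTERVAL_MS);
        printf("idle round %d: wait %d ms\n", round, interval);
    }

    /* some activity arrives: interval snaps back to the minimum */
    interval = next_interval(interval, 1, MAX_INTERVAL_MS);
    printf("after activity: wait %d ms\n", interval);
    return 0;
}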
+ */ + if (MySubscription->retaindeadtuples) + CheckSubDeadTupleRetention(false, true, WARNING); + proc_exit(0); } diff --git a/src/backend/replication/pgoutput/pgoutput.c b/src/backend/replication/pgoutput/pgoutput.c index 082b4d9d327..f4c977262c5 100644 --- a/src/backend/replication/pgoutput/pgoutput.c +++ b/src/backend/replication/pgoutput/pgoutput.c @@ -297,10 +297,12 @@ parse_output_parameters(List *options, PGOutputData *data) bool two_phase_option_given = false; bool origin_option_given = false; + /* Initialize optional parameters to defaults */ data->binary = false; data->streaming = LOGICALREP_STREAM_OFF; data->messages = false; data->two_phase = false; + data->publish_no_origin = false; foreach(lc, options) { diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c index e44ad576bc7..8605776ad86 100644 --- a/src/backend/replication/slot.c +++ b/src/backend/replication/slot.c @@ -47,6 +47,7 @@ #include "miscadmin.h" #include "pgstat.h" #include "postmaster/interrupt.h" +#include "replication/logicallauncher.h" #include "replication/slotsync.h" #include "replication/slot.h" #include "replication/walsender_private.h" @@ -172,6 +173,7 @@ static SyncStandbySlotsConfigData *synchronized_standby_slots_config; static XLogRecPtr ss_oldest_flush_lsn = InvalidXLogRecPtr; static void ReplicationSlotShmemExit(int code, Datum arg); +static bool IsSlotForConflictCheck(const char *name); static void ReplicationSlotDropPtr(ReplicationSlot *slot); /* internal persistency functions */ @@ -258,13 +260,17 @@ ReplicationSlotShmemExit(int code, Datum arg) /* * Check whether the passed slot name is valid and report errors at elevel. * + * An error will be reported for a reserved replication slot name if + * allow_reserved_name is set to false. + * * Slot names may consist out of [a-z0-9_]{1,NAMEDATALEN-1} which should allow * the name to be used as a directory name on every supported OS. * * Returns whether the directory name is valid or not if elevel < ERROR. */ bool -ReplicationSlotValidateName(const char *name, int elevel) +ReplicationSlotValidateName(const char *name, bool allow_reserved_name, + int elevel) { const char *cp; @@ -300,10 +306,32 @@ ReplicationSlotValidateName(const char *name, int elevel) return false; } } + + if (!allow_reserved_name && IsSlotForConflictCheck(name)) + { + ereport(elevel, + errcode(ERRCODE_RESERVED_NAME), + errmsg("replication slot name \"%s\" is reserved", + name), + errdetail("The name \"%s\" is reserved for the conflict detection slot.", + CONFLICT_DETECTION_SLOT)); + + return false; + } + return true; } /* + * Return true if the replication slot name is "pg_conflict_detection". + */ +static bool +IsSlotForConflictCheck(const char *name) +{ + return (strcmp(name, CONFLICT_DETECTION_SLOT) == 0); +} + +/* * Create a new replication slot and mark it as used by this backend. * * name: Name of the slot @@ -330,7 +358,12 @@ ReplicationSlotCreate(const char *name, bool db_specific, Assert(MyReplicationSlot == NULL); - ReplicationSlotValidateName(name, ERROR); + /* + * The logical launcher or pg_upgrade may create or migrate an internal + * slot, so using a reserved name is allowed in these cases. + */ + ReplicationSlotValidateName(name, IsBinaryUpgrade || IsLogicalLauncher(), + ERROR); if (failover) { @@ -582,6 +615,17 @@ retry: } /* + * Do not allow users to acquire the reserved slot. This scenario may + * occur if the launcher that owns the slot has terminated unexpectedly + * due to an error, and a backend process attempts to reuse the slot. 
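The reserved-name handling above boils down to a two-step check: a slot name must consist of [a-z0-9_] characters, and unless the caller explicitly allows reserved names (pg_upgrade or the logical launcher), it must not be the internal conflict-detection slot name. A standalone sketch of that check (the length limit and error reporting of the real ReplicationSlotValidateName() are omitted, and the helper name is invented):

#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define RESERVED_SLOT_NAME "pg_conflict_detection"

/*
 * Accept [a-z0-9_]+ and, unless reserved names are allowed, reject the
 * internal conflict-detection slot name.
 */
static bool slot_name_ok(const char *name, bool allow_reserved)
{
    if (*name == '\0')
        return false;
    for (const char *cp = name; *cp; cp++)
    {
        if (!(islower((unsigned char) *cp) ||
              isdigit((unsigned char) *cp) ||
              *cp == '_'))
            return false;
    }
    if (!allow_reserved && strcmp(name, RESERVED_SLOT_NAME) == 0)
        return false;
    return true;
}

int main(void)
{
    printf("%d %d %d\n",
           slot_name_ok("my_slot", false),                 /* 1 */
           slot_name_ok("pg_conflict_detection", false),   /* 0 */
           slot_name_ok("Bad-Name", false));               /* 0 */
    return 0;
}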
+ */ + if (!IsLogicalLauncher() && IsSlotForConflictCheck(name)) + ereport(ERROR, + errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("cannot acquire replication slot \"%s\"", name), + errdetail("The slot is reserved for conflict detection and can only be acquired by logical replication launcher.")); + + /* * This is the slot we want; check if it's active under some other * process. In single user mode, we don't need this check. */ diff --git a/src/backend/replication/syncrep_scanner.l b/src/backend/replication/syncrep_scanner.l index 7dec1f869c7..02004d621e7 100644 --- a/src/backend/replication/syncrep_scanner.l +++ b/src/backend/replication/syncrep_scanner.l @@ -157,17 +157,16 @@ syncrep_yyerror(SyncRepConfigData **syncrep_parse_result_p, char **syncrep_parse { struct yyguts_t *yyg = (struct yyguts_t *) yyscanner; /* needed for yytext * macro */ - char *syncrep_parse_error_msg = *syncrep_parse_error_msg_p; /* report only the first error in a parse operation */ - if (syncrep_parse_error_msg) + if (*syncrep_parse_error_msg_p) return; if (yytext[0]) - syncrep_parse_error_msg = psprintf("%s at or near \"%s\"", - message, yytext); + *syncrep_parse_error_msg_p = psprintf("%s at or near \"%s\"", + message, yytext); else - syncrep_parse_error_msg = psprintf("%s at end of input", - message); + *syncrep_parse_error_msg_p = psprintf("%s at end of input", + message); } void diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index 28b8591efa5..ee911394a23 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -65,6 +65,7 @@ #include "funcapi.h" #include "libpq/libpq.h" #include "libpq/pqformat.h" +#include "libpq/protocol.h" #include "miscadmin.h" #include "nodes/replnodes.h" #include "pgstat.h" @@ -84,6 +85,7 @@ #include "storage/ipc.h" #include "storage/pmsignal.h" #include "storage/proc.h" +#include "storage/procarray.h" #include "tcop/dest.h" #include "tcop/tcopprot.h" #include "utils/acl.h" @@ -258,6 +260,7 @@ static void StartLogicalReplication(StartReplicationCmd *cmd); static void ProcessStandbyMessage(void); static void ProcessStandbyReplyMessage(void); static void ProcessStandbyHSFeedbackMessage(void); +static void ProcessStandbyPSRequestMessage(void); static void ProcessRepliesIfAny(void); static void ProcessPendingWrites(void); static void WalSndKeepalive(bool requestReply, XLogRecPtr writePtr); @@ -733,13 +736,13 @@ HandleUploadManifestPacket(StringInfo buf, off_t *offset, switch (mtype) { - case 'd': /* CopyData */ + case PqMsg_CopyData: maxmsglen = PQ_LARGE_MESSAGE_LIMIT; break; - case 'c': /* CopyDone */ - case 'f': /* CopyFail */ - case 'H': /* Flush */ - case 'S': /* Sync */ + case PqMsg_CopyDone: + case PqMsg_CopyFail: + case PqMsg_Flush: + case PqMsg_Sync: maxmsglen = PQ_SMALL_MESSAGE_LIMIT; break; default: @@ -761,19 +764,19 @@ HandleUploadManifestPacket(StringInfo buf, off_t *offset, /* Process the message */ switch (mtype) { - case 'd': /* CopyData */ + case PqMsg_CopyData: AppendIncrementalManifestData(ib, buf->data, buf->len); return true; - case 'c': /* CopyDone */ + case PqMsg_CopyDone: return false; - case 'H': /* Sync */ - case 'S': /* Flush */ + case PqMsg_Sync: + case PqMsg_Flush: /* Ignore these while in CopyOut mode as we do elsewhere. 
*/ return true; - case 'f': + case PqMsg_CopyFail: ereport(ERROR, (errcode(ERRCODE_QUERY_CANCELED), errmsg("COPY from stdin failed: %s", @@ -1567,7 +1570,7 @@ WalSndWriteData(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid, tmpbuf.data, sizeof(int64)); /* output previously gathered data in a CopyData packet */ - pq_putmessage_noblock('d', ctx->out->data, ctx->out->len); + pq_putmessage_noblock(PqMsg_CopyData, ctx->out->data, ctx->out->len); CHECK_FOR_INTERRUPTS(); @@ -2303,7 +2306,7 @@ ProcessRepliesIfAny(void) case PqMsg_CopyDone: if (!streamingDoneSending) { - pq_putmessage_noblock('c', NULL, 0); + pq_putmessage_noblock(PqMsg_CopyDone, NULL, 0); streamingDoneSending = true; } @@ -2355,6 +2358,10 @@ ProcessStandbyMessage(void) ProcessStandbyHSFeedbackMessage(); break; + case 'p': + ProcessStandbyPSRequestMessage(); + break; + default: ereport(COMMERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), @@ -2702,6 +2709,60 @@ ProcessStandbyHSFeedbackMessage(void) } /* + * Process the request for a primary status update message. + */ +static void +ProcessStandbyPSRequestMessage(void) +{ + XLogRecPtr lsn = InvalidXLogRecPtr; + TransactionId oldestXidInCommit; + FullTransactionId nextFullXid; + FullTransactionId fullOldestXidInCommit; + WalSnd *walsnd = MyWalSnd; + TimestampTz replyTime; + + /* + * This shouldn't happen because we don't support getting primary status + * message from standby. + */ + if (RecoveryInProgress()) + elog(ERROR, "the primary status is unavailable during recovery"); + + replyTime = pq_getmsgint64(&reply_message); + + /* + * Update shared state for this WalSender process based on reply data from + * standby. + */ + SpinLockAcquire(&walsnd->mutex); + walsnd->replyTime = replyTime; + SpinLockRelease(&walsnd->mutex); + + /* + * Consider transactions in the current database, as only these are the + * ones replicated. + */ + oldestXidInCommit = GetOldestActiveTransactionId(true, false); + nextFullXid = ReadNextFullTransactionId(); + fullOldestXidInCommit = FullTransactionIdFromAllowableAt(nextFullXid, + oldestXidInCommit); + lsn = GetXLogWriteRecPtr(); + + elog(DEBUG2, "sending primary status"); + + /* construct the message... */ + resetStringInfo(&output_message); + pq_sendbyte(&output_message, 's'); + pq_sendint64(&output_message, lsn); + pq_sendint64(&output_message, (int64) U64FromFullTransactionId(fullOldestXidInCommit)); + pq_sendint64(&output_message, (int64) U64FromFullTransactionId(nextFullXid)); + pq_sendint64(&output_message, GetCurrentTimestamp()); + + /* ... and send it wrapped in CopyData */ + pq_putmessage_noblock(PqMsg_CopyData, output_message.data, output_message.len); +} + +/* * Compute how long send/receive loops should sleep. * * If wal_sender_timeout is enabled we want to wake up in time to send @@ -3246,7 +3307,7 @@ XLogSendPhysical(void) wal_segment_close(xlogreader); /* Send CopyDone */ - pq_putmessage_noblock('c', NULL, 0); + pq_putmessage_noblock(PqMsg_CopyDone, NULL, 0); streamingDoneSending = true; WalSndCaughtUp = true; @@ -3374,7 +3435,7 @@ retry: memcpy(&output_message.data[1 + sizeof(int64) + sizeof(int64)], tmpbuf.data, sizeof(int64)); - pq_putmessage_noblock('d', output_message.data, output_message.len); + pq_putmessage_noblock(PqMsg_CopyData, output_message.data, output_message.len); sentPtr = endptr; @@ -4080,7 +4141,7 @@ WalSndKeepalive(bool requestReply, XLogRecPtr writePtr) pq_sendbyte(&output_message, requestReply ? 1 : 0); /* ... 
and send it wrapped in CopyData */ - pq_putmessage_noblock('d', output_message.data, output_message.len); + pq_putmessage_noblock(PqMsg_CopyData, output_message.data, output_message.len); /* Set local flag */ if (requestReply) diff --git a/src/backend/storage/aio/README.md b/src/backend/storage/aio/README.md index f10b5c7e31e..72ae3b3737d 100644 --- a/src/backend/storage/aio/README.md +++ b/src/backend/storage/aio/README.md @@ -94,7 +94,7 @@ pgaio_io_register_callbacks(ioh, PGAIO_HCB_SHARED_BUFFER_READV, 0); * * In this example we're reading only a single buffer, hence the 1. */ -pgaio_io_set_handle_data_32(ioh, (uint32 *) buffer, 1); +pgaio_io_set_handle_data_32(ioh, (uint32 *) &buffer, 1); /* * Pass the AIO handle to lower-level function. When operating on the level of @@ -119,8 +119,9 @@ pgaio_io_set_handle_data_32(ioh, (uint32 *) buffer, 1); * e.g. due to reaching a limit on the number of unsubmitted IOs, and even * complete before smgrstartreadv() returns. */ +void *page = BufferGetBlock(buffer); smgrstartreadv(ioh, operation->smgr, forknum, blkno, - BufferGetBlock(buffer), 1); + &page, 1); /* * To benefit from AIO, it is beneficial to perform other work, including diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 6afdd28dba6..67431208e7f 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -2743,12 +2743,10 @@ ExtendBufferedRelShared(BufferManagerRelation bmr, * because mdread doesn't complain about reads beyond EOF (when * zero_damaged_pages is ON) and so a previous attempt to read a block * beyond EOF could have left a "valid" zero-filled buffer. - * Unfortunately, we have also seen this case occurring because of - * buggy Linux kernels that sometimes return an lseek(SEEK_END) result - * that doesn't account for a recent write. In that situation, the - * pre-existing buffer would contain valid data that we don't want to - * overwrite. Since the legitimate cases should always have left a - * zero-filled buffer, complain if not PageIsNew. + * + * This has also been observed when relation was overwritten by + * external process. Since the legitimate cases should always have + * left a zero-filled buffer, complain if not PageIsNew. */ if (existing_id >= 0) { @@ -2778,8 +2776,7 @@ ExtendBufferedRelShared(BufferManagerRelation bmr, ereport(ERROR, (errmsg("unexpected data beyond EOF in block %u of relation %s", existing_hdr->tag.blockNum, - relpath(bmr.smgr->smgr_rlocator, fork).str), - errhint("This has been seen to occur with buggy kernels; consider updating your system."))); + relpath(bmr.smgr->smgr_rlocator, fork).str))); /* * We *must* do smgr[zero]extend before succeeding, else the page diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c index 3da9c41ee1d..3c0d20f4659 100644 --- a/src/backend/storage/buffer/localbuf.c +++ b/src/backend/storage/buffer/localbuf.c @@ -932,10 +932,11 @@ GetLocalBufferStorage(void) num_bufs = Min(num_bufs, MaxAllocSize / BLCKSZ); /* Buffers should be I/O aligned. 
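The localbuf.c change that follows replaces the old pattern of over-allocating by PG_IO_ALIGN_SIZE and rounding the pointer up with a direct request for aligned memory from the allocator. The same idea in standalone C11 using aligned_alloc (the alignment and sizes are illustrative, not PostgreSQL's values):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define BLCKSZ     8192
#define IO_ALIGN   4096             /* illustrative alignment */

int main(void)
{
    /* aligned_alloc requires the size to be a multiple of the alignment */
    char *bufs = aligned_alloc(IO_ALIGN, 16 * BLCKSZ);

    if (bufs == NULL)
        return 1;

    printf("aligned: %d\n", ((uintptr_t) bufs % IO_ALIGN) == 0);
    free(bufs);
    return 0;
}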
*/ - cur_block = (char *) - TYPEALIGN(PG_IO_ALIGN_SIZE, - MemoryContextAlloc(LocalBufferContext, - num_bufs * BLCKSZ + PG_IO_ALIGN_SIZE)); + cur_block = MemoryContextAllocAligned(LocalBufferContext, + num_bufs * BLCKSZ, + PG_IO_ALIGN_SIZE, + 0); + next_buf_in_block = 0; num_bufs_in_block = num_bufs; } diff --git a/src/backend/storage/ipc/latch.c b/src/backend/storage/ipc/latch.c index c6aefd2f688..beadeb5e46a 100644 --- a/src/backend/storage/ipc/latch.c +++ b/src/backend/storage/ipc/latch.c @@ -187,9 +187,11 @@ WaitLatch(Latch *latch, int wakeEvents, long timeout, if (!(wakeEvents & WL_LATCH_SET)) latch = NULL; ModifyWaitEvent(LatchWaitSet, LatchWaitSetLatchPos, WL_LATCH_SET, latch); - ModifyWaitEvent(LatchWaitSet, LatchWaitSetPostmasterDeathPos, - (wakeEvents & (WL_EXIT_ON_PM_DEATH | WL_POSTMASTER_DEATH)), - NULL); + + if (IsUnderPostmaster) + ModifyWaitEvent(LatchWaitSet, LatchWaitSetPostmasterDeathPos, + (wakeEvents & (WL_EXIT_ON_PM_DEATH | WL_POSTMASTER_DEATH)), + NULL); if (WaitEventSetWait(LatchWaitSet, (wakeEvents & WL_TIMEOUT) ? timeout : -1, diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index 2418967def6..bf987aed8d3 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -2814,8 +2814,10 @@ GetRunningTransactionData(void) * * Similar to GetSnapshotData but returns just oldestActiveXid. We include * all PGPROCs with an assigned TransactionId, even VACUUM processes. - * We look at all databases, though there is no need to include WALSender - * since this has no effect on hot standby conflicts. + * + * If allDbs is true, we look at all databases, though there is no need to + * include WALSender since this has no effect on hot standby conflicts. If + * allDbs is false, skip processes attached to other databases. * * This is never executed during recovery so there is no need to look at * KnownAssignedXids. @@ -2823,9 +2825,12 @@ GetRunningTransactionData(void) * We don't worry about updating other counters, we want to keep this as * simple as possible and leave GetSnapshotData() as the primary code for * that bookkeeping. + * + * inCommitOnly indicates getting the oldestActiveXid among the transactions + * in the commit critical section. 
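GetOldestActiveTransactionId() now takes the two filters documented above: inCommitOnly restricts the scan to transactions currently in the commit critical section (DELAY_CHKPT_IN_COMMIT), and allDbs=false skips processes attached to other databases. A standalone sketch of that filtered minimum scan (plain integer comparison instead of the wraparound-aware TransactionIdPrecedes(), and all struct and flag names are invented):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define IN_COMMIT_FLAG  0x1

typedef struct { uint32_t xid; int dbid; int flags; } FakeProc;

/* Oldest assigned xid, optionally restricted to in-commit and/or one DB. */
static uint32_t oldest_active_xid(const FakeProc *procs, int n,
                                  bool in_commit_only, bool all_dbs, int my_db)
{
    uint32_t oldest = UINT32_MAX;

    for (int i = 0; i < n; i++)
    {
        if (procs[i].xid == 0)                      /* no xid assigned */
            continue;
        if (in_commit_only && !(procs[i].flags & IN_COMMIT_FLAG))
            continue;
        if (!all_dbs && procs[i].dbid != my_db)
            continue;
        if (procs[i].xid < oldest)
            oldest = procs[i].xid;
    }
    return oldest;
}

int main(void)
{
    FakeProc procs[] = {
        {100, 1, 0},
        {90, 2, IN_COMMIT_FLAG},
        {95, 1, IN_COMMIT_FLAG},
        {0, 1, 0},
    };

    printf("%u\n", (unsigned) oldest_active_xid(procs, 4, true, false, 1));  /* 95 */
    printf("%u\n", (unsigned) oldest_active_xid(procs, 4, false, true, 1));  /* 90 */
    return 0;
}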
*/ TransactionId -GetOldestActiveTransactionId(void) +GetOldestActiveTransactionId(bool inCommitOnly, bool allDbs) { ProcArrayStruct *arrayP = procArray; TransactionId *other_xids = ProcGlobal->xids; @@ -2852,6 +2857,8 @@ GetOldestActiveTransactionId(void) for (index = 0; index < arrayP->numProcs; index++) { TransactionId xid; + int pgprocno = arrayP->pgprocnos[index]; + PGPROC *proc = &allProcs[pgprocno]; /* Fetch xid just once - see GetNewTransactionId */ xid = UINT32_ACCESS_ONCE(other_xids[index]); @@ -2859,6 +2866,13 @@ GetOldestActiveTransactionId(void) if (!TransactionIdIsNormal(xid)) continue; + if (inCommitOnly && + (proc->delayChkptFlags & DELAY_CHKPT_IN_COMMIT) == 0) + continue; + + if (!allDbs && proc->databaseId != MyDatabaseId) + continue; + if (TransactionIdPrecedes(xid, oldestRunningXid)) oldestRunningXid = xid; diff --git a/src/backend/storage/ipc/procsignal.c b/src/backend/storage/ipc/procsignal.c index a9bb540b55a..087821311cc 100644 --- a/src/backend/storage/ipc/procsignal.c +++ b/src/backend/storage/ipc/procsignal.c @@ -728,7 +728,11 @@ procsignal_sigusr1_handler(SIGNAL_ARGS) void SendCancelRequest(int backendPID, const uint8 *cancel_key, int cancel_key_len) { - Assert(backendPID != 0); + if (backendPID == 0) + { + ereport(LOG, (errmsg("invalid cancel request with PID 0"))); + return; + } /* * See if we have a matching backend. Reading the pss_pid and diff --git a/src/backend/storage/lmgr/generate-lwlocknames.pl b/src/backend/storage/lmgr/generate-lwlocknames.pl index 4441b7cba0c..cd3e43c448a 100644 --- a/src/backend/storage/lmgr/generate-lwlocknames.pl +++ b/src/backend/storage/lmgr/generate-lwlocknames.pl @@ -10,7 +10,6 @@ use Getopt::Long; my $output_path = '.'; my $lastlockidx = -1; -my $continue = "\n"; GetOptions('outdir:s' => \$output_path); @@ -28,18 +27,24 @@ print $h "/* there is deliberately not an #ifndef LWLOCKNAMES_H here */\n\n"; # -# First, record the predefined LWLocks listed in wait_event_names.txt. We'll -# cross-check those with the ones in lwlocklist.h. +# First, record the predefined LWLocks and built-in tranches listed in +# wait_event_names.txt. We'll cross-check those with the ones in lwlocklist.h. # +my @wait_event_tranches; my @wait_event_lwlocks; my $record_lwlocks = 0; +my $in_tranches = 0; while (<$wait_event_names>) { chomp; # Check for end marker. - last if /^# END OF PREDEFINED LWLOCKS/; + if (/^# END OF PREDEFINED LWLOCKS/) + { + $in_tranches = 1; + next; + } # Skip comments and empty lines. next if /^#/; @@ -55,13 +60,29 @@ while (<$wait_event_names>) # Go to the next line if we are not yet recording LWLocks. next if not $record_lwlocks; + # Stop recording if we reach another section. + last if /^Section:/; + # Record the LWLock. (my $waiteventname, my $waitevendocsentence) = split(/\t/, $_); - push(@wait_event_lwlocks, $waiteventname); + + if ($in_tranches) + { + push(@wait_event_tranches, $waiteventname); + } + else + { + push(@wait_event_lwlocks, $waiteventname); + } } +# +# While gathering the list of predefined LWLocks, cross-check the lists in +# lwlocklist.h with the wait events we just recorded. +# my $in_comment = 0; -my $i = 0; +my $lwlock_count = 0; +my $tranche_count = 0; while (<$lwlocklist>) { chomp; @@ -82,40 +103,72 @@ while (<$lwlocklist>) next; } - die "unable to parse lwlocklist.h line \"$_\"" - unless /^PG_LWLOCK\((\d+),\s+(\w+)\)$/; + # + # Gather list of predefined LWLocks and cross-check with the wait events. 
+ # + if (/^PG_LWLOCK\((\d+),\s+(\w+)\)$/) + { + my ($lockidx, $lockname) = ($1, $2); - (my $lockidx, my $lockname) = ($1, $2); + die "lwlocklist.h not in order" if $lockidx < $lastlockidx; + die "lwlocklist.h has duplicates" if $lockidx == $lastlockidx; - die "lwlocklist.h not in order" if $lockidx < $lastlockidx; - die "lwlocklist.h has duplicates" if $lockidx == $lastlockidx; + die "$lockname defined in lwlocklist.h but missing from " + . "wait_event_names.txt" + if $lwlock_count >= scalar @wait_event_lwlocks; + die "lists of predefined LWLocks do not match (first mismatch at " + . "$wait_event_lwlocks[$lwlock_count] in wait_event_names.txt and " + . "$lockname in lwlocklist.h)" + if $wait_event_lwlocks[$lwlock_count] ne $lockname; - die "$lockname defined in lwlocklist.h but missing from " - . "wait_event_names.txt" - if $i >= scalar @wait_event_lwlocks; - die "lists of predefined LWLocks do not match (first mismatch at " - . "$wait_event_lwlocks[$i] in wait_event_names.txt and $lockname in " - . "lwlocklist.h)" - if $wait_event_lwlocks[$i] ne $lockname; - $i++; + $lwlock_count++; - while ($lastlockidx < $lockidx - 1) + while ($lastlockidx < $lockidx - 1) + { + ++$lastlockidx; + } + $lastlockidx = $lockidx; + + # Add a "Lock" suffix to each lock name, as the C code depends on that. + printf $h "#define %-32s (&MainLWLockArray[$lockidx].lock)\n", + $lockname . "Lock"; + + next; + } + + # + # Cross-check the built-in LWLock tranches with the wait events. + # + if (/^PG_LWLOCKTRANCHE\((\w+),\s+(\w+)\)$/) { - ++$lastlockidx; - $continue = ",\n"; + my ($tranche_id, $tranche_name) = ($1, $2); + + die "$tranche_name defined in lwlocklist.h but missing from " + . "wait_event_names.txt" + if $tranche_count >= scalar @wait_event_tranches; + die + "lists of built-in LWLock tranches do not match (first mismatch at " + . "$wait_event_tranches[$tranche_count] in wait_event_names.txt and " + . "$tranche_name in lwlocklist.h)" + if $wait_event_tranches[$tranche_count] ne $tranche_name; + + $tranche_count++; + + next; } - $lastlockidx = $lockidx; - $continue = ",\n"; - # Add a "Lock" suffix to each lock name, as the C code depends on that - printf $h "#define %-32s (&MainLWLockArray[$lockidx].lock)\n", - $lockname . "Lock"; + die "unable to parse lwlocklist.h line \"$_\""; } die - "$wait_event_lwlocks[$i] defined in wait_event_names.txt but missing from " - . "lwlocklist.h" - if $i < scalar @wait_event_lwlocks; + "$wait_event_lwlocks[$lwlock_count] defined in wait_event_names.txt but " + . " missing from lwlocklist.h" + if $lwlock_count < scalar @wait_event_lwlocks; + +die + "$wait_event_tranches[$tranche_count] defined in wait_event_names.txt but " + . "missing from lwlocklist.h" + if $tranche_count < scalar @wait_event_tranches; print $h "\n"; printf $h "#define NUM_INDIVIDUAL_LWLOCKS %s\n", $lastlockidx + 1; diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c index 46f44bc4511..ec9c345ffdf 100644 --- a/src/backend/storage/lmgr/lwlock.c +++ b/src/backend/storage/lmgr/lwlock.c @@ -122,9 +122,8 @@ StaticAssertDecl((LW_VAL_EXCLUSIVE & LW_FLAG_MASK) == 0, * own tranche. We absorb the names of these tranches from there into * BuiltinTrancheNames here. * - * 2. There are some predefined tranches for built-in groups of locks. - * These are listed in enum BuiltinTrancheIds in lwlock.h, and their names - * appear in BuiltinTrancheNames[] below. + * 2. There are some predefined tranches for built-in groups of locks defined + * in lwlocklist.h. 
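The lwlocklist.h entries cross-checked by the generator script above are also what the lwlock.c hunk just below expands into the BuiltinTrancheNames[] designated-initializer array. A minimal standalone sketch of that X-macro technique; the indexes and the inline invocations are invented for illustration (the real invocations come from #include "storage/lwlocklist.h", and tranche entries use LWTRANCHE_-prefixed indexes).

#include <stdio.h>

#define STRINGIFY(name) #name   /* stand-in for CppAsString() */

/* Each entry macro turns one list line into a designated initializer. */
#define PG_LWLOCK(id, name)         [id] = STRINGIFY(name),
#define PG_LWLOCKTRANCHE(id, name)  [id] = STRINGIFY(name),

static const char *const TrancheNames[] = {
    /* in PostgreSQL these lines live in storage/lwlocklist.h */
    PG_LWLOCK(0, ShmemIndex)
    PG_LWLOCK(1, OidGen)
    PG_LWLOCKTRANCHE(2, XactBuffer)
    PG_LWLOCKTRANCHE(3, WALInsert)
};

int
main(void)
{
    int     n = (int) (sizeof(TrancheNames) / sizeof(TrancheNames[0]));

    for (int i = 0; i < n; i++)
        printf("%d -> %s\n", i, TrancheNames[i]);
    return 0;
}

Because both the generator script and the C array consume the same list, adding one line to lwlocklist.h keeps the names, the wait events, and the cross-checks in sync.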
We absorb the names of these tranches, too. * * 3. Extensions can create new tranches, via either RequestNamedLWLockTranche * or LWLockRegisterTranche. The names of these that are known in the current @@ -135,49 +134,10 @@ StaticAssertDecl((LW_VAL_EXCLUSIVE & LW_FLAG_MASK) == 0, */ static const char *const BuiltinTrancheNames[] = { #define PG_LWLOCK(id, lockname) [id] = CppAsString(lockname), +#define PG_LWLOCKTRANCHE(id, lockname) [LWTRANCHE_##id] = CppAsString(lockname), #include "storage/lwlocklist.h" #undef PG_LWLOCK - [LWTRANCHE_XACT_BUFFER] = "XactBuffer", - [LWTRANCHE_COMMITTS_BUFFER] = "CommitTsBuffer", - [LWTRANCHE_SUBTRANS_BUFFER] = "SubtransBuffer", - [LWTRANCHE_MULTIXACTOFFSET_BUFFER] = "MultiXactOffsetBuffer", - [LWTRANCHE_MULTIXACTMEMBER_BUFFER] = "MultiXactMemberBuffer", - [LWTRANCHE_NOTIFY_BUFFER] = "NotifyBuffer", - [LWTRANCHE_SERIAL_BUFFER] = "SerialBuffer", - [LWTRANCHE_WAL_INSERT] = "WALInsert", - [LWTRANCHE_BUFFER_CONTENT] = "BufferContent", - [LWTRANCHE_REPLICATION_ORIGIN_STATE] = "ReplicationOriginState", - [LWTRANCHE_REPLICATION_SLOT_IO] = "ReplicationSlotIO", - [LWTRANCHE_LOCK_FASTPATH] = "LockFastPath", - [LWTRANCHE_BUFFER_MAPPING] = "BufferMapping", - [LWTRANCHE_LOCK_MANAGER] = "LockManager", - [LWTRANCHE_PREDICATE_LOCK_MANAGER] = "PredicateLockManager", - [LWTRANCHE_PARALLEL_HASH_JOIN] = "ParallelHashJoin", - [LWTRANCHE_PARALLEL_BTREE_SCAN] = "ParallelBtreeScan", - [LWTRANCHE_PARALLEL_QUERY_DSA] = "ParallelQueryDSA", - [LWTRANCHE_PER_SESSION_DSA] = "PerSessionDSA", - [LWTRANCHE_PER_SESSION_RECORD_TYPE] = "PerSessionRecordType", - [LWTRANCHE_PER_SESSION_RECORD_TYPMOD] = "PerSessionRecordTypmod", - [LWTRANCHE_SHARED_TUPLESTORE] = "SharedTupleStore", - [LWTRANCHE_SHARED_TIDBITMAP] = "SharedTidBitmap", - [LWTRANCHE_PARALLEL_APPEND] = "ParallelAppend", - [LWTRANCHE_PER_XACT_PREDICATE_LIST] = "PerXactPredicateList", - [LWTRANCHE_PGSTATS_DSA] = "PgStatsDSA", - [LWTRANCHE_PGSTATS_HASH] = "PgStatsHash", - [LWTRANCHE_PGSTATS_DATA] = "PgStatsData", - [LWTRANCHE_LAUNCHER_DSA] = "LogicalRepLauncherDSA", - [LWTRANCHE_LAUNCHER_HASH] = "LogicalRepLauncherHash", - [LWTRANCHE_DSM_REGISTRY_DSA] = "DSMRegistryDSA", - [LWTRANCHE_DSM_REGISTRY_HASH] = "DSMRegistryHash", - [LWTRANCHE_COMMITTS_SLRU] = "CommitTsSLRU", - [LWTRANCHE_MULTIXACTOFFSET_SLRU] = "MultixactOffsetSLRU", - [LWTRANCHE_MULTIXACTMEMBER_SLRU] = "MultixactMemberSLRU", - [LWTRANCHE_NOTIFY_SLRU] = "NotifySLRU", - [LWTRANCHE_SERIAL_SLRU] = "SerialSLRU", - [LWTRANCHE_SUBTRANS_SLRU] = "SubtransSLRU", - [LWTRANCHE_XACT_SLRU] = "XactSLRU", - [LWTRANCHE_PARALLEL_VACUUM_DSA] = "ParallelVacuumDSA", - [LWTRANCHE_AIO_URING_COMPLETION] = "AioUringCompletion", +#undef PG_LWLOCKTRANCHE }; StaticAssertDecl(lengthof(BuiltinTrancheNames) == diff --git a/src/backend/tcop/backend_startup.c b/src/backend/tcop/backend_startup.c index ad0af5edc1f..14d5fc0b196 100644 --- a/src/backend/tcop/backend_startup.c +++ b/src/backend/tcop/backend_startup.c @@ -492,7 +492,7 @@ static int ProcessStartupPacket(Port *port, bool ssl_done, bool gss_done) { int32 len; - char *buf; + char *buf = NULL; ProtocolVersion proto; MemoryContext oldcontext; @@ -516,7 +516,7 @@ ProcessStartupPacket(Port *port, bool ssl_done, bool gss_done) * scanners, which may be less benign, but it's not really our job to * notice those.) 
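The ProcessStartupPacket() hunks that follow replace the scattered "return STATUS_ERROR" exits with "goto fail", so the packet buffer is freed on every error path. A standalone sketch of that single-exit cleanup pattern; read_packet() and the constants are placeholders, not the real backend API.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define STATUS_OK     0
#define STATUS_ERROR (-1)

/* placeholder: pretend to read a length-prefixed packet from a client */
static int
read_packet(char *dst, size_t len)
{
    memset(dst, 'x', len);
    return 0;                   /* 0 = success, nonzero = failure */
}

static int
process_startup_packet(size_t len)
{
    char   *buf = NULL;         /* initialize so "goto fail" is always safe */

    if (len == 0 || len > 10000)
        goto fail;              /* invalid length of startup packet */

    buf = malloc(len);
    if (buf == NULL)
        goto fail;

    if (read_packet(buf, len) != 0)
        goto fail;              /* incomplete startup packet */

    /* ... parse the packet here ... */

    free(buf);
    return STATUS_OK;

fail:
    free(buf);                  /* free(NULL) is a no-op, so this is safe */
    return STATUS_ERROR;
}

int
main(void)
{
    printf("%d %d\n", process_startup_packet(64), process_startup_packet(0));
    return 0;
}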
*/ - return STATUS_ERROR; + goto fail; } if (pq_getbytes(((char *) &len) + 1, 3) == EOF) @@ -526,7 +526,7 @@ ProcessStartupPacket(Port *port, bool ssl_done, bool gss_done) ereport(COMMERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("incomplete startup packet"))); - return STATUS_ERROR; + goto fail; } len = pg_ntoh32(len); @@ -538,7 +538,7 @@ ProcessStartupPacket(Port *port, bool ssl_done, bool gss_done) ereport(COMMERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("invalid length of startup packet"))); - return STATUS_ERROR; + goto fail; } /* @@ -554,7 +554,7 @@ ProcessStartupPacket(Port *port, bool ssl_done, bool gss_done) ereport(COMMERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("incomplete startup packet"))); - return STATUS_ERROR; + goto fail; } pq_endmsgread(); @@ -568,7 +568,7 @@ ProcessStartupPacket(Port *port, bool ssl_done, bool gss_done) { ProcessCancelRequestPacket(port, buf, len); /* Not really an error, but we don't want to proceed further */ - return STATUS_ERROR; + goto fail; } if (proto == NEGOTIATE_SSL_CODE && !ssl_done) @@ -607,14 +607,16 @@ ProcessStartupPacket(Port *port, bool ssl_done, bool gss_done) ereport(COMMERROR, (errcode_for_socket_access(), errmsg("failed to send SSL negotiation response: %m"))); - return STATUS_ERROR; /* close the connection */ + goto fail; /* close the connection */ } #ifdef USE_SSL if (SSLok == 'S' && secure_open_server(port) == -1) - return STATUS_ERROR; + goto fail; #endif + pfree(buf); + /* * At this point we should have no data already buffered. If we do, * it was received before we performed the SSL handshake, so it wasn't @@ -661,14 +663,16 @@ ProcessStartupPacket(Port *port, bool ssl_done, bool gss_done) ereport(COMMERROR, (errcode_for_socket_access(), errmsg("failed to send GSSAPI negotiation response: %m"))); - return STATUS_ERROR; /* close the connection */ + goto fail; /* close the connection */ } #ifdef ENABLE_GSS if (GSSok == 'G' && secure_open_gssapi(port) == -1) - return STATUS_ERROR; + goto fail; #endif + pfree(buf); + /* * At this point we should have no data already buffered. If we do, * it was received before we performed the GSS handshake, so it wasn't @@ -863,7 +867,16 @@ ProcessStartupPacket(Port *port, bool ssl_done, bool gss_done) */ MemoryContextSwitchTo(oldcontext); + pfree(buf); + return STATUS_OK; + +fail: + /* be tidy, just to avoid Valgrind complaints */ + if (buf) + pfree(buf); + + return STATUS_ERROR; } /* diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 2f8c3d5f918..0cecd464902 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -988,6 +988,7 @@ pg_plan_queries(List *querytrees, const char *query_string, int cursorOptions, stmt->stmt_location = query->stmt_location; stmt->stmt_len = query->stmt_len; stmt->queryId = query->queryId; + stmt->planOrigin = PLAN_STMT_INTERNAL; } else { diff --git a/src/backend/tcop/pquery.c b/src/backend/tcop/pquery.c index d1593f38b35..08791b8f75e 100644 --- a/src/backend/tcop/pquery.c +++ b/src/backend/tcop/pquery.c @@ -1350,24 +1350,15 @@ PortalRunMulti(Portal portal, PopActiveSnapshot(); /* - * If a query completion data was supplied, use it. Otherwise use the - * portal's query completion data. - * - * Exception: Clients expect INSERT/UPDATE/DELETE tags to have counts, so - * fake them with zeros. This can happen with DO INSTEAD rules if there - * is no replacement query of the same type as the original. 
We print "0 - * 0" here because technically there is no query of the matching tag type, - * and printing a non-zero count for a different query type seems wrong, - * e.g. an INSERT that does an UPDATE instead should not print "0 1" if - * one row was updated. See QueryRewrite(), step 3, for details. + * If a command tag was requested and we did not fill in a run-time- + * determined tag above, copy the parse-time tag from the Portal. (There + * might not be any tag there either, in edge cases such as empty prepared + * statements. That's OK.) */ - if (qc && qc->commandTag == CMDTAG_UNKNOWN) - { - if (portal->qc.commandTag != CMDTAG_UNKNOWN) - CopyQueryCompletion(qc, &portal->qc); - /* If the caller supplied a qc, we should have set it by now. */ - Assert(qc->commandTag != CMDTAG_UNKNOWN); - } + if (qc && + qc->commandTag == CMDTAG_UNKNOWN && + portal->qc.commandTag != CMDTAG_UNKNOWN) + CopyQueryCompletion(qc, &portal->qc); } /* diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index 4c1faf5575c..4f4191b0ea6 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -1234,6 +1234,7 @@ ProcessUtilitySlow(ParseState *pstate, wrapper->utilityStmt = stmt; wrapper->stmt_location = pstmt->stmt_location; wrapper->stmt_len = pstmt->stmt_len; + wrapper->planOrigin = PLAN_STMT_INTERNAL; ProcessUtility(wrapper, queryString, @@ -1964,6 +1965,7 @@ ProcessUtilityForAlterTable(Node *stmt, AlterTableUtilityContext *context) wrapper->utilityStmt = stmt; wrapper->stmt_location = context->pstmt->stmt_location; wrapper->stmt_len = context->pstmt->stmt_len; + wrapper->planOrigin = PLAN_STMT_INTERNAL; ProcessUtility(wrapper, context->queryString, diff --git a/src/backend/tsearch/dict_ispell.c b/src/backend/tsearch/dict_ispell.c index 63bd193a78a..debfbf956cc 100644 --- a/src/backend/tsearch/dict_ispell.c +++ b/src/backend/tsearch/dict_ispell.c @@ -47,24 +47,30 @@ dispell_init(PG_FUNCTION_ARGS) if (strcmp(defel->defname, "dictfile") == 0) { + char *filename; + if (dictloaded) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("multiple DictFile parameters"))); - NIImportDictionary(&(d->obj), - get_tsearch_config_filename(defGetString(defel), - "dict")); + filename = get_tsearch_config_filename(defGetString(defel), + "dict"); + NIImportDictionary(&(d->obj), filename); + pfree(filename); dictloaded = true; } else if (strcmp(defel->defname, "afffile") == 0) { + char *filename; + if (affloaded) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("multiple AffFile parameters"))); - NIImportAffixes(&(d->obj), - get_tsearch_config_filename(defGetString(defel), - "affix")); + filename = get_tsearch_config_filename(defGetString(defel), + "affix"); + NIImportAffixes(&(d->obj), filename); + pfree(filename); affloaded = true; } else if (strcmp(defel->defname, "stopwords") == 0) diff --git a/src/backend/tsearch/dict_synonym.c b/src/backend/tsearch/dict_synonym.c index 0da5a9d6868..c2773eb01ad 100644 --- a/src/backend/tsearch/dict_synonym.c +++ b/src/backend/tsearch/dict_synonym.c @@ -199,6 +199,7 @@ skipline: } tsearch_readline_end(&trst); + pfree(filename); d->len = cur; qsort(d->syn, d->len, sizeof(Syn), compareSyn); diff --git a/src/backend/tsearch/dict_thesaurus.c b/src/backend/tsearch/dict_thesaurus.c index 1bebe36a691..1e6bbde1ca7 100644 --- a/src/backend/tsearch/dict_thesaurus.c +++ b/src/backend/tsearch/dict_thesaurus.c @@ -167,17 +167,17 @@ addWrd(DictThesaurus *d, char *b, char *e, uint32 idsubst, uint16 nwrd, uint16 p static void thesaurusRead(const 
char *filename, DictThesaurus *d) { + char *real_filename = get_tsearch_config_filename(filename, "ths"); tsearch_readline_state trst; uint32 idsubst = 0; bool useasis = false; char *line; - filename = get_tsearch_config_filename(filename, "ths"); - if (!tsearch_readline_begin(&trst, filename)) + if (!tsearch_readline_begin(&trst, real_filename)) ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("could not open thesaurus file \"%s\": %m", - filename))); + real_filename))); while ((line = tsearch_readline(&trst)) != NULL) { @@ -297,6 +297,7 @@ thesaurusRead(const char *filename, DictThesaurus *d) d->nsubst = idsubst; tsearch_readline_end(&trst); + pfree(real_filename); } static TheLexeme * diff --git a/src/backend/utils/activity/pgstat.c b/src/backend/utils/activity/pgstat.c index 8b57845e870..6bc91ce0dad 100644 --- a/src/backend/utils/activity/pgstat.c +++ b/src/backend/utils/activity/pgstat.c @@ -212,6 +212,11 @@ int pgstat_fetch_consistency = PGSTAT_FETCH_CONSISTENCY_CACHE; PgStat_LocalState pgStatLocal; +/* + * Track pending reports for fixed-numbered stats, used by + * pgstat_report_stat(). + */ +bool pgstat_report_fixed = false; /* ---------- * Local data @@ -370,7 +375,6 @@ static const PgStat_KindInfo pgstat_kind_builtin_infos[PGSTAT_KIND_BUILTIN_SIZE] .shared_data_off = offsetof(PgStatShared_Backend, stats), .shared_data_len = sizeof(((PgStatShared_Backend *) 0)->stats), - .have_static_pending_cb = pgstat_backend_have_pending_cb, .flush_static_cb = pgstat_backend_flush_cb, .reset_timestamp_cb = pgstat_backend_reset_timestamp_cb, }, @@ -437,7 +441,6 @@ static const PgStat_KindInfo pgstat_kind_builtin_infos[PGSTAT_KIND_BUILTIN_SIZE] .shared_data_len = sizeof(((PgStatShared_IO *) 0)->stats), .flush_static_cb = pgstat_io_flush_cb, - .have_static_pending_cb = pgstat_io_have_pending_cb, .init_shmem_cb = pgstat_io_init_shmem_cb, .reset_all_cb = pgstat_io_reset_all_cb, .snapshot_cb = pgstat_io_snapshot_cb, @@ -455,7 +458,6 @@ static const PgStat_KindInfo pgstat_kind_builtin_infos[PGSTAT_KIND_BUILTIN_SIZE] .shared_data_len = sizeof(((PgStatShared_SLRU *) 0)->stats), .flush_static_cb = pgstat_slru_flush_cb, - .have_static_pending_cb = pgstat_slru_have_pending_cb, .init_shmem_cb = pgstat_slru_init_shmem_cb, .reset_all_cb = pgstat_slru_reset_all_cb, .snapshot_cb = pgstat_slru_snapshot_cb, @@ -474,7 +476,6 @@ static const PgStat_KindInfo pgstat_kind_builtin_infos[PGSTAT_KIND_BUILTIN_SIZE] .init_backend_cb = pgstat_wal_init_backend_cb, .flush_static_cb = pgstat_wal_flush_cb, - .have_static_pending_cb = pgstat_wal_have_pending_cb, .init_shmem_cb = pgstat_wal_init_shmem_cb, .reset_all_cb = pgstat_wal_reset_all_cb, .snapshot_cb = pgstat_wal_snapshot_cb, @@ -708,29 +709,10 @@ pgstat_report_stat(bool force) } /* Don't expend a clock check if nothing to do */ - if (dlist_is_empty(&pgStatPending)) + if (dlist_is_empty(&pgStatPending) && + !pgstat_report_fixed) { - bool do_flush = false; - - /* Check for pending stats */ - for (PgStat_Kind kind = PGSTAT_KIND_MIN; kind <= PGSTAT_KIND_MAX; kind++) - { - const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); - - if (!kind_info) - continue; - if (!kind_info->have_static_pending_cb) - continue; - - if (kind_info->have_static_pending_cb()) - { - do_flush = true; - break; - } - } - - if (!do_flush) - return 0; + return 0; } /* @@ -784,16 +766,19 @@ pgstat_report_stat(bool force) partial_flush |= pgstat_flush_pending_entries(nowait); /* flush of other stats kinds */ - for (PgStat_Kind kind = PGSTAT_KIND_MIN; kind <= PGSTAT_KIND_MAX; kind++) + 
if (pgstat_report_fixed) { - const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); + for (PgStat_Kind kind = PGSTAT_KIND_MIN; kind <= PGSTAT_KIND_MAX; kind++) + { + const PgStat_KindInfo *kind_info = pgstat_get_kind_info(kind); - if (!kind_info) - continue; - if (!kind_info->flush_static_cb) - continue; + if (!kind_info) + continue; + if (!kind_info->flush_static_cb) + continue; - partial_flush |= kind_info->flush_static_cb(nowait); + partial_flush |= kind_info->flush_static_cb(nowait); + } } last_flush = now; @@ -815,6 +800,7 @@ pgstat_report_stat(bool force) } pending_since = 0; + pgstat_report_fixed = false; return 0; } diff --git a/src/backend/utils/activity/pgstat_backend.c b/src/backend/utils/activity/pgstat_backend.c index 51256277e8d..8714a85e2d9 100644 --- a/src/backend/utils/activity/pgstat_backend.c +++ b/src/backend/utils/activity/pgstat_backend.c @@ -66,6 +66,7 @@ pgstat_count_backend_io_op_time(IOObject io_object, IOContext io_context, io_time); backend_has_iostats = true; + pgstat_report_fixed = true; } void @@ -81,6 +82,7 @@ pgstat_count_backend_io_op(IOObject io_object, IOContext io_context, PendingBackendStats.pending_io.bytes[io_object][io_context][io_op] += bytes; backend_has_iostats = true; + pgstat_report_fixed = true; } /* @@ -302,18 +304,6 @@ pgstat_flush_backend(bool nowait, bits32 flags) } /* - * Check if there are any backend stats waiting for flush. - */ -bool -pgstat_backend_have_pending_cb(void) -{ - if (!pgstat_tracks_backend_bktype(MyBackendType)) - return false; - - return (backend_has_iostats || pgstat_backend_wal_have_pending()); -} - -/* * Callback to flush out locally pending backend statistics. * * If some stats could not be flushed due to lock contention, return true. diff --git a/src/backend/utils/activity/pgstat_io.c b/src/backend/utils/activity/pgstat_io.c index d8d26379a57..13ae57ed649 100644 --- a/src/backend/utils/activity/pgstat_io.c +++ b/src/backend/utils/activity/pgstat_io.c @@ -80,6 +80,7 @@ pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op, pgstat_count_backend_io_op(io_object, io_context, io_op, cnt, bytes); have_iostats = true; + pgstat_report_fixed = true; } /* @@ -168,15 +169,6 @@ pgstat_fetch_stat_io(void) } /* - * Check if there any IO stats waiting for flush. - */ -bool -pgstat_io_have_pending_cb(void) -{ - return have_iostats; -} - -/* * Simpler wrapper of pgstat_io_flush_cb() */ void diff --git a/src/backend/utils/activity/pgstat_slru.c b/src/backend/utils/activity/pgstat_slru.c index b9e940dde45..7bd8744accb 100644 --- a/src/backend/utils/activity/pgstat_slru.c +++ b/src/backend/utils/activity/pgstat_slru.c @@ -144,15 +144,6 @@ pgstat_get_slru_index(const char *name) } /* - * Check if there are any SLRU stats entries waiting for flush. - */ -bool -pgstat_slru_have_pending_cb(void) -{ - return have_slrustats; -} - -/* * Flush out locally pending SLRU stats entries * * If nowait is true, this function returns false on lock failure. Otherwise @@ -247,6 +238,7 @@ get_slru_entry(int slru_idx) Assert((slru_idx >= 0) && (slru_idx < SLRU_NUM_ELEMENTS)); have_slrustats = true; + pgstat_report_fixed = true; return &pending_SLRUStats[slru_idx]; } diff --git a/src/backend/utils/activity/pgstat_wal.c b/src/backend/utils/activity/pgstat_wal.c index 16a1ecb4d90..0d04480d2f6 100644 --- a/src/backend/utils/activity/pgstat_wal.c +++ b/src/backend/utils/activity/pgstat_wal.c @@ -72,6 +72,15 @@ pgstat_fetch_stat_wal(void) } /* + * To determine whether WAL usage happened. 
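The pgstat hunks above replace the per-kind have_static_pending_cb callbacks with one process-wide flag, pgstat_report_fixed, set whenever fixed-numbered stats are counted and cleared after a flush. A standalone sketch of that pattern; the counters and function names here are invented stand-ins.

#include <stdbool.h>
#include <stdio.h>

/* one process-wide flag: "some fixed-numbered stats are pending" */
static bool report_fixed = false;

static long io_reads = 0;
static long wal_records = 0;

static void
count_io_read(void)
{
    io_reads++;
    report_fixed = true;        /* cheap: just set the flag */
}

static void
count_wal_record(void)
{
    wal_records++;
    report_fixed = true;
}

static void
report_stat(void)
{
    if (!report_fixed)
    {
        printf("nothing pending, skipping flush\n");
        return;                 /* no per-kind callbacks to poll */
    }

    printf("flushing: io_reads=%ld wal_records=%ld\n", io_reads, wal_records);
    io_reads = wal_records = 0;
    report_fixed = false;
}

int
main(void)
{
    report_stat();              /* skipped */
    count_io_read();
    count_wal_record();
    report_stat();              /* flushes */
    return 0;
}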
+ */ +static inline bool +pgstat_wal_have_pending(void) +{ + return pgWalUsage.wal_records != prevWalUsage.wal_records; +} + +/* * Calculate how much WAL usage counters have increased by subtracting the * previous counters from the current ones. * @@ -92,7 +101,7 @@ pgstat_wal_flush_cb(bool nowait) * This function can be called even if nothing at all has happened. Avoid * taking lock for nothing in that case. */ - if (!pgstat_wal_have_pending_cb()) + if (!pgstat_wal_have_pending()) return false; /* @@ -136,15 +145,6 @@ pgstat_wal_init_backend_cb(void) prevWalUsage = pgWalUsage; } -/* - * To determine whether WAL usage happened. - */ -bool -pgstat_wal_have_pending_cb(void) -{ - return pgWalUsage.wal_records != prevWalUsage.wal_records; -} - void pgstat_wal_init_shmem_cb(void *stats) { diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt index 4da68312b5f..0be307d2ca0 100644 --- a/src/backend/utils/activity/wait_event_names.txt +++ b/src/backend/utils/activity/wait_event_names.txt @@ -356,9 +356,13 @@ AioWorkerSubmissionQueue "Waiting to access AIO worker submission queue." # # END OF PREDEFINED LWLOCKS (DO NOT CHANGE THIS LINE) # -# Predefined LWLocks (i.e., those declared in lwlocknames.h) must be listed -# in the section above and must be listed in the same order as in -# lwlocknames.h. Other LWLocks must be listed in the section below. +# Predefined LWLocks (i.e., those declared at the top of lwlocknames.h) must be +# listed in the section above and must be listed in the same order as in +# lwlocknames.h. +# +# Likewise, the built-in LWLock tranches (i.e., those declared at the bottom of +# lwlocknames.h) must be listed in the section below and must be listed in the +# same order as in lwlocknames.h. # XactBuffer "Waiting for I/O on a transaction status SLRU buffer." diff --git a/src/backend/utils/adt/bytea.c b/src/backend/utils/adt/bytea.c index 2e539c2504e..6e7b914c563 100644 --- a/src/backend/utils/adt/bytea.c +++ b/src/backend/utils/adt/bytea.c @@ -182,27 +182,21 @@ bytea_overlay(bytea *t1, bytea *t2, int sp, int sl) * * Non-printable characters must be passed as '\nnn' (octal) and are * converted to internal form. '\' must be passed as '\\'. - * ereport(ERROR, ...) if bad form. - * - * BUGS: - * The input is scanned twice. - * The error checking of input is minimal. 
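The rewritten byteain() that follows drops the old two-pass scan: it allocates for the worst case up front, decodes \\ and \nnn (octal) escapes in a single pass, and sets the real length at the end. A standalone sketch of the same decoding loop for a plain C string; error handling is reduced to returning -1.

#include <stdio.h>
#include <string.h>

#define VAL(c)  ((c) - '0')

/*
 * Decode backslash escapes of the form \\ and \nnn (octal) into out[],
 * which must have room for at least strlen(in) bytes (decoding can only
 * shrink the data).  Returns the number of output bytes, or -1 on bad
 * syntax.
 */
static int
decode_bytea_escaped(const char *in, unsigned char *out)
{
    const char *tp = in;
    unsigned char *rp = out;

    while (*tp != '\0')
    {
        if (tp[0] != '\\')
            *rp++ = (unsigned char) *tp++;
        else if (tp[1] >= '0' && tp[1] <= '3' &&
                 tp[2] >= '0' && tp[2] <= '7' &&
                 tp[3] >= '0' && tp[3] <= '7')
        {
            int     v = VAL(tp[1]);

            v = (v << 3) + VAL(tp[2]);
            v = (v << 3) + VAL(tp[3]);
            *rp++ = (unsigned char) v;
            tp += 4;
        }
        else if (tp[1] == '\\')
        {
            *rp++ = '\\';
            tp += 2;
        }
        else
            return -1;          /* lone backslash without a valid escape */
    }
    return (int) (rp - out);
}

int
main(void)
{
    unsigned char buf[64];
    int     n = decode_bytea_escaped("ab\\000\\\\cd", buf);

    printf("decoded %d bytes\n", n);    /* "ab", NUL, "\", "cd" -> 6 bytes */
    return 0;
}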
*/ Datum byteain(PG_FUNCTION_ARGS) { char *inputText = PG_GETARG_CSTRING(0); Node *escontext = fcinfo->context; + size_t len = strlen(inputText); + size_t bc; char *tp; char *rp; - int bc; bytea *result; /* Recognize hex input */ if (inputText[0] == '\\' && inputText[1] == 'x') { - size_t len = strlen(inputText); - bc = (len - 2) / 2 + VARHDRSZ; /* maximum possible length */ result = palloc(bc); bc = hex_decode_safe(inputText + 2, len - 2, VARDATA(result), @@ -213,33 +207,7 @@ byteain(PG_FUNCTION_ARGS) } /* Else, it's the traditional escaped style */ - for (bc = 0, tp = inputText; *tp != '\0'; bc++) - { - if (tp[0] != '\\') - tp++; - else if ((tp[0] == '\\') && - (tp[1] >= '0' && tp[1] <= '3') && - (tp[2] >= '0' && tp[2] <= '7') && - (tp[3] >= '0' && tp[3] <= '7')) - tp += 4; - else if ((tp[0] == '\\') && - (tp[1] == '\\')) - tp += 2; - else - { - /* - * one backslash, not followed by another or ### valid octal - */ - ereturn(escontext, (Datum) 0, - (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), - errmsg("invalid input syntax for type %s", "bytea"))); - } - } - - bc += VARHDRSZ; - - result = (bytea *) palloc(bc); - SET_VARSIZE(result, bc); + result = (bytea *) palloc(len + VARHDRSZ); /* maximum possible length */ tp = inputText; rp = VARDATA(result); @@ -247,21 +215,21 @@ byteain(PG_FUNCTION_ARGS) { if (tp[0] != '\\') *rp++ = *tp++; - else if ((tp[0] == '\\') && - (tp[1] >= '0' && tp[1] <= '3') && + else if ((tp[1] >= '0' && tp[1] <= '3') && (tp[2] >= '0' && tp[2] <= '7') && (tp[3] >= '0' && tp[3] <= '7')) { - bc = VAL(tp[1]); - bc <<= 3; - bc += VAL(tp[2]); - bc <<= 3; - *rp++ = bc + VAL(tp[3]); + int v; + + v = VAL(tp[1]); + v <<= 3; + v += VAL(tp[2]); + v <<= 3; + *rp++ = v + VAL(tp[3]); tp += 4; } - else if ((tp[0] == '\\') && - (tp[1] == '\\')) + else if (tp[1] == '\\') { *rp++ = '\\'; tp += 2; @@ -269,7 +237,7 @@ byteain(PG_FUNCTION_ARGS) else { /* - * We should never get here. The first pass should not allow it. + * one backslash, not followed by another or ### valid octal */ ereturn(escontext, (Datum) 0, (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), @@ -277,6 +245,9 @@ byteain(PG_FUNCTION_ARGS) } } + bc = rp - VARDATA(result); /* actual length */ + SET_VARSIZE(result, bc + VARHDRSZ); + PG_RETURN_BYTEA_P(result); } diff --git a/src/backend/utils/adt/jsonb_util.c b/src/backend/utils/adt/jsonb_util.c index c8b6c15e059..82b807d067a 100644 --- a/src/backend/utils/adt/jsonb_util.c +++ b/src/backend/utils/adt/jsonb_util.c @@ -277,22 +277,16 @@ compareJsonbContainers(JsonbContainer *a, JsonbContainer *b) else { /* - * It's safe to assume that the types differed, and that the va - * and vb values passed were set. - * - * If the two values were of the same container type, then there'd - * have been a chance to observe the variation in the number of - * elements/pairs (when processing WJB_BEGIN_OBJECT, say). They're - * either two heterogeneously-typed containers, or a container and - * some scalar type. - * - * We don't have to consider the WJB_END_ARRAY and WJB_END_OBJECT - * cases here, because we would have seen the corresponding - * WJB_BEGIN_ARRAY and WJB_BEGIN_OBJECT tokens first, and - * concluded that they don't match. + * It's not possible for one iterator to report end of array or + * object while the other one reports something else, because we + * would have detected a length mismatch when we processed the + * container-start tokens above. Likewise we can't see WJB_DONE + * from one but not the other. 
So we have two different-type + * containers, or a container and some scalar type, or two + * different scalar types. Sort on the basis of the type code. */ - Assert(ra != WJB_END_ARRAY && ra != WJB_END_OBJECT); - Assert(rb != WJB_END_ARRAY && rb != WJB_END_OBJECT); + Assert(ra != WJB_DONE && ra != WJB_END_ARRAY && ra != WJB_END_OBJECT); + Assert(rb != WJB_DONE && rb != WJB_END_ARRAY && rb != WJB_END_OBJECT); Assert(va.type != vb.type); Assert(va.type != jbvBinary); @@ -852,15 +846,20 @@ JsonbIteratorInit(JsonbContainer *container) * It is our job to expand the jbvBinary representation without bothering them * with it. However, clients should not take it upon themselves to touch array * or Object element/pair buffers, since their element/pair pointers are - * garbage. Also, *val will not be set when returning WJB_END_ARRAY or - * WJB_END_OBJECT, on the assumption that it's only useful to access values - * when recursing in. + * garbage. + * + * *val is not meaningful when the result is WJB_DONE, WJB_END_ARRAY or + * WJB_END_OBJECT. However, we set val->type = jbvNull in those cases, + * so that callers may assume that val->type is always well-defined. */ JsonbIteratorToken JsonbIteratorNext(JsonbIterator **it, JsonbValue *val, bool skipNested) { if (*it == NULL) + { + val->type = jbvNull; return WJB_DONE; + } /* * When stepping into a nested container, we jump back here to start @@ -898,6 +897,7 @@ recurse: * nesting). */ *it = freeAndGetParent(*it); + val->type = jbvNull; return WJB_END_ARRAY; } @@ -951,6 +951,7 @@ recurse: * of nesting). */ *it = freeAndGetParent(*it); + val->type = jbvNull; return WJB_END_OBJECT; } else @@ -995,8 +996,10 @@ recurse: return WJB_VALUE; } - elog(ERROR, "invalid iterator state"); - return -1; + elog(ERROR, "invalid jsonb iterator state"); + /* satisfy compilers that don't know that elog(ERROR) doesn't return */ + val->type = jbvNull; + return WJB_DONE; } /* diff --git a/src/backend/utils/adt/pg_upgrade_support.c b/src/backend/utils/adt/pg_upgrade_support.c index d44f8c262ba..a4f8b4faa90 100644 --- a/src/backend/utils/adt/pg_upgrade_support.c +++ b/src/backend/utils/adt/pg_upgrade_support.c @@ -21,6 +21,7 @@ #include "commands/extension.h" #include "miscadmin.h" #include "replication/logical.h" +#include "replication/logicallauncher.h" #include "replication/origin.h" #include "replication/worker_internal.h" #include "storage/lmgr.h" @@ -410,3 +411,21 @@ binary_upgrade_replorigin_advance(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } + +/* + * binary_upgrade_create_conflict_detection_slot + * + * Create a replication slot to retain information necessary for conflict + * detection such as dead tuples, commit timestamps, and origins. + */ +Datum +binary_upgrade_create_conflict_detection_slot(PG_FUNCTION_ARGS) +{ + CHECK_IS_BINARY_UPGRADE; + + CreateConflictDetectionSlot(); + + ReplicationSlotRelease(); + + PG_RETURN_VOID(); +} diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index ce6a626eba2..17fbfa9b410 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -3798,18 +3798,25 @@ estimate_multivariate_bucketsize(PlannerInfo *root, RelOptInfo *inner, List *hashclauses, Selectivity *innerbucketsize) { - List *clauses = list_copy(hashclauses); - List *otherclauses = NIL; - double ndistinct = 1.0; + List *clauses; + List *otherclauses; + double ndistinct; if (list_length(hashclauses) <= 1) - + { /* * Nothing to do for a single clause. Could we employ univariate * extended stat here? 
*/ return hashclauses; + } + /* "clauses" is the list of hashclauses we've not dealt with yet */ + clauses = list_copy(hashclauses); + /* "otherclauses" holds clauses we are going to return to caller */ + otherclauses = NIL; + /* current estimate of ndistinct */ + ndistinct = 1.0; while (clauses != NIL) { ListCell *lc; @@ -3874,12 +3881,13 @@ estimate_multivariate_bucketsize(PlannerInfo *root, RelOptInfo *inner, group_rel = root->simple_rel_array[relid]; } else if (group_relid != relid) - + { /* * Being in the group forming state we don't need other * clauses. */ continue; + } /* * We're going to add the new clause to the varinfos list. We diff --git a/src/backend/utils/adt/tid.c b/src/backend/utils/adt/tid.c index 1b0df111717..39dab3e42df 100644 --- a/src/backend/utils/adt/tid.c +++ b/src/backend/utils/adt/tid.c @@ -84,7 +84,7 @@ tidin(PG_FUNCTION_ARGS) /* * Cope with possibility that unsigned long is wider than BlockNumber, in * which case strtoul will not raise an error for some values that are out - * of the range of BlockNumber. (See similar code in oidin().) + * of the range of BlockNumber. (See similar code in uint32in_subr().) */ #if SIZEOF_LONG > 4 if (cvt != (unsigned long) blockNumber && diff --git a/src/backend/utils/adt/xml.c b/src/backend/utils/adt/xml.c index f7b731825fc..182e8f75db7 100644 --- a/src/backend/utils/adt/xml.c +++ b/src/backend/utils/adt/xml.c @@ -1769,7 +1769,7 @@ xml_doctype_in_content(const xmlChar *str) * xmloption_arg, but a DOCTYPE node in the input can force DOCUMENT mode). * * If parsed_nodes isn't NULL and we parse in CONTENT mode, the list - * of parsed nodes from the xmlParseInNodeContext call will be returned + * of parsed nodes from the xmlParseBalancedChunkMemory call will be returned * to *parsed_nodes. (It is caller's responsibility to free that.) * * Errors normally result in ereport(ERROR), but if escontext is an @@ -1795,6 +1795,7 @@ xml_parse(text *data, XmlOptionType xmloption_arg, PgXmlErrorContext *xmlerrcxt; volatile xmlParserCtxtPtr ctxt = NULL; volatile xmlDocPtr doc = NULL; + volatile int save_keep_blanks = -1; /* * This step looks annoyingly redundant, but we must do it to have a @@ -1822,7 +1823,6 @@ xml_parse(text *data, XmlOptionType xmloption_arg, PG_TRY(); { bool parse_as_document = false; - int options; int res_code; size_t count = 0; xmlChar *version = NULL; @@ -1853,18 +1853,6 @@ xml_parse(text *data, XmlOptionType xmloption_arg, parse_as_document = true; } - /* - * Select parse options. - * - * Note that here we try to apply DTD defaults (XML_PARSE_DTDATTR) - * according to SQL/XML:2008 GR 10.16.7.d: 'Default values defined by - * internal DTD are applied'. As for external DTDs, we try to support - * them too (see SQL/XML:2008 GR 10.16.7.e), but that doesn't really - * happen because xmlPgEntityLoader prevents it. - */ - options = XML_PARSE_NOENT | XML_PARSE_DTDATTR - | (preserve_whitespace ? 0 : XML_PARSE_NOBLANKS); - /* initialize output parameters */ if (parsed_xmloptiontype != NULL) *parsed_xmloptiontype = parse_as_document ? XMLOPTION_DOCUMENT : @@ -1874,11 +1862,26 @@ xml_parse(text *data, XmlOptionType xmloption_arg, if (parse_as_document) { + int options; + + /* set up parser context used by xmlCtxtReadDoc */ ctxt = xmlNewParserCtxt(); if (ctxt == NULL || xmlerrcxt->err_occurred) xml_ereport(xmlerrcxt, ERROR, ERRCODE_OUT_OF_MEMORY, "could not allocate parser context"); + /* + * Select parse options. 
+ * + * Note that here we try to apply DTD defaults (XML_PARSE_DTDATTR) + * according to SQL/XML:2008 GR 10.16.7.d: 'Default values defined + * by internal DTD are applied'. As for external DTDs, we try to + * support them too (see SQL/XML:2008 GR 10.16.7.e), but that + * doesn't really happen because xmlPgEntityLoader prevents it. + */ + options = XML_PARSE_NOENT | XML_PARSE_DTDATTR + | (preserve_whitespace ? 0 : XML_PARSE_NOBLANKS); + doc = xmlCtxtReadDoc(ctxt, utf8string, NULL, /* no URL */ "UTF-8", @@ -1900,10 +1903,7 @@ xml_parse(text *data, XmlOptionType xmloption_arg, } else { - xmlNodePtr root; - xmlNodePtr oldroot PG_USED_FOR_ASSERTS_ONLY; - - /* set up document with empty root node to be the context node */ + /* set up document that xmlParseBalancedChunkMemory will add to */ doc = xmlNewDoc(version); if (doc == NULL || xmlerrcxt->err_occurred) xml_ereport(xmlerrcxt, ERROR, ERRCODE_OUT_OF_MEMORY, @@ -1916,43 +1916,22 @@ xml_parse(text *data, XmlOptionType xmloption_arg, "could not allocate XML document"); doc->standalone = standalone; - root = xmlNewNode(NULL, (const xmlChar *) "content-root"); - if (root == NULL || xmlerrcxt->err_occurred) - xml_ereport(xmlerrcxt, ERROR, ERRCODE_OUT_OF_MEMORY, - "could not allocate xml node"); - - /* - * This attaches root to doc, so we need not free it separately; - * and there can't yet be any old root to free. - */ - oldroot = xmlDocSetRootElement(doc, root); - Assert(oldroot == NULL); + /* set parse options --- have to do this the ugly way */ + save_keep_blanks = xmlKeepBlanksDefault(preserve_whitespace ? 1 : 0); /* allow empty content */ if (*(utf8string + count)) { - xmlNodePtr node_list = NULL; - xmlParserErrors res; - - res = xmlParseInNodeContext(root, - (char *) utf8string + count, - strlen((char *) utf8string + count), - options, - &node_list); - - if (res != XML_ERR_OK || xmlerrcxt->err_occurred) + res_code = xmlParseBalancedChunkMemory(doc, NULL, NULL, 0, + utf8string + count, + parsed_nodes); + if (res_code != 0 || xmlerrcxt->err_occurred) { - xmlFreeNodeList(node_list); xml_errsave(escontext, xmlerrcxt, ERRCODE_INVALID_XML_CONTENT, "invalid XML content"); goto fail; } - - if (parsed_nodes != NULL) - *parsed_nodes = node_list; - else - xmlFreeNodeList(node_list); } } @@ -1961,6 +1940,8 @@ fail: } PG_CATCH(); { + if (save_keep_blanks != -1) + xmlKeepBlanksDefault(save_keep_blanks); if (doc != NULL) xmlFreeDoc(doc); if (ctxt != NULL) @@ -1972,6 +1953,9 @@ fail: } PG_END_TRY(); + if (save_keep_blanks != -1) + xmlKeepBlanksDefault(save_keep_blanks); + if (ctxt != NULL) xmlFreeParserCtxt(ctxt); diff --git a/src/backend/utils/cache/evtcache.c b/src/backend/utils/cache/evtcache.c index ce596bf5638..b9d5a5998be 100644 --- a/src/backend/utils/cache/evtcache.c +++ b/src/backend/utils/cache/evtcache.c @@ -78,7 +78,6 @@ BuildEventTriggerCache(void) { HASHCTL ctl; HTAB *cache; - MemoryContext oldcontext; Relation rel; Relation irel; SysScanDesc scan; @@ -110,9 +109,6 @@ BuildEventTriggerCache(void) (Datum) 0); } - /* Switch to correct memory context. */ - oldcontext = MemoryContextSwitchTo(EventTriggerCacheContext); - /* Prevent the memory context from being nuked while we're rebuilding. */ EventTriggerCacheState = ETCS_REBUILD_STARTED; @@ -145,6 +141,7 @@ BuildEventTriggerCache(void) bool evttags_isnull; EventTriggerCacheEntry *entry; bool found; + MemoryContext oldcontext; /* Get next tuple. 
*/ tup = systable_getnext_ordered(scan, ForwardScanDirection); @@ -171,6 +168,9 @@ BuildEventTriggerCache(void) else continue; + /* Switch to correct memory context. */ + oldcontext = MemoryContextSwitchTo(EventTriggerCacheContext); + /* Allocate new cache item. */ item = palloc0(sizeof(EventTriggerCacheItem)); item->fnoid = form->evtfoid; @@ -188,6 +188,9 @@ BuildEventTriggerCache(void) entry->triggerlist = lappend(entry->triggerlist, item); else entry->triggerlist = list_make1(item); + + /* Restore previous memory context. */ + MemoryContextSwitchTo(oldcontext); } /* Done with pg_event_trigger scan. */ @@ -195,9 +198,6 @@ BuildEventTriggerCache(void) index_close(irel, AccessShareLock); relation_close(rel, AccessShareLock); - /* Restore previous memory context. */ - MemoryContextSwitchTo(oldcontext); - /* Install new cache. */ EventTriggerCache = cache; @@ -240,6 +240,8 @@ DecodeTextArrayToBitmapset(Datum array) } pfree(elems); + if ((Pointer) arr != DatumGetPointer(array)) + pfree(arr); return bms; } diff --git a/src/backend/utils/cache/plancache.c b/src/backend/utils/cache/plancache.c index 89a1c79e984..6661d2c6b73 100644 --- a/src/backend/utils/cache/plancache.c +++ b/src/backend/utils/cache/plancache.c @@ -463,8 +463,7 @@ CompleteCachedPlan(CachedPlanSource *plansource, /* * Save the final parameter types (or other parameter specification data) - * into the source_context, as well as our other parameters. Also save - * the result tuple descriptor. + * into the source_context, as well as our other parameters. */ MemoryContextSwitchTo(source_context); @@ -480,9 +479,25 @@ CompleteCachedPlan(CachedPlanSource *plansource, plansource->parserSetupArg = parserSetupArg; plansource->cursor_options = cursor_options; plansource->fixed_result = fixed_result; - plansource->resultDesc = PlanCacheComputeResultDesc(querytree_list); + /* + * Also save the result tuple descriptor. PlanCacheComputeResultDesc may + * leak some cruft; normally we just accept that to save a copy step, but + * in USE_VALGRIND mode be tidy by running it in the caller's context. + */ +#ifdef USE_VALGRIND + MemoryContextSwitchTo(oldcxt); + plansource->resultDesc = PlanCacheComputeResultDesc(querytree_list); + if (plansource->resultDesc) + { + MemoryContextSwitchTo(source_context); + plansource->resultDesc = CreateTupleDescCopy(plansource->resultDesc); + MemoryContextSwitchTo(oldcxt); + } +#else + plansource->resultDesc = PlanCacheComputeResultDesc(querytree_list); MemoryContextSwitchTo(oldcxt); +#endif plansource->is_complete = true; plansource->is_valid = true; @@ -1283,6 +1298,7 @@ GetCachedPlan(CachedPlanSource *plansource, ParamListInfo boundParams, CachedPlan *plan = NULL; List *qlist; bool customplan; + ListCell *lc; /* Assert caller is doing things in a sane order */ Assert(plansource->magic == CACHEDPLANSOURCE_MAGIC); @@ -1385,6 +1401,13 @@ GetCachedPlan(CachedPlanSource *plansource, ParamListInfo boundParams, plan->is_saved = true; } + foreach(lc, plan->stmt_list) + { + PlannedStmt *pstmt = (PlannedStmt *) lfirst(lc); + + pstmt->planOrigin = customplan ? 
PLAN_STMT_CACHE_CUSTOM : PLAN_STMT_CACHE_GENERIC; + } + return plan; } diff --git a/src/backend/utils/cache/ts_cache.c b/src/backend/utils/cache/ts_cache.c index 18cccd778fd..e8ae53238d0 100644 --- a/src/backend/utils/cache/ts_cache.c +++ b/src/backend/utils/cache/ts_cache.c @@ -321,7 +321,9 @@ lookup_ts_dictionary_cache(Oid dictId) /* * Init method runs in dictionary's private memory context, and we - * make sure the options are stored there too + * make sure the options are stored there too. This typically + * results in a small amount of memory leakage, but it's not worth + * complicating the API for tmplinit functions to avoid it. */ oldcontext = MemoryContextSwitchTo(entry->dictCtx); diff --git a/src/backend/utils/cache/typcache.c b/src/backend/utils/cache/typcache.c index f9aec38a11f..6a347698edf 100644 --- a/src/backend/utils/cache/typcache.c +++ b/src/backend/utils/cache/typcache.c @@ -1171,9 +1171,6 @@ load_domaintype_info(TypeCacheEntry *typentry) elog(ERROR, "domain \"%s\" constraint \"%s\" has NULL conbin", NameStr(typTup->typname), NameStr(c->conname)); - /* Convert conbin to C string in caller context */ - constring = TextDatumGetCString(val); - /* Create the DomainConstraintCache object and context if needed */ if (dcc == NULL) { @@ -1189,9 +1186,8 @@ load_domaintype_info(TypeCacheEntry *typentry) dcc->dccRefCount = 0; } - /* Create node trees in DomainConstraintCache's context */ - oldcxt = MemoryContextSwitchTo(dcc->dccContext); - + /* Convert conbin to a node tree, still in caller's context */ + constring = TextDatumGetCString(val); check_expr = (Expr *) stringToNode(constring); /* @@ -1206,10 +1202,13 @@ load_domaintype_info(TypeCacheEntry *typentry) */ check_expr = expression_planner(check_expr); + /* Create only the minimally needed stuff in dccContext */ + oldcxt = MemoryContextSwitchTo(dcc->dccContext); + r = makeNode(DomainConstraintState); r->constrainttype = DOM_CONSTRAINT_CHECK; r->name = pstrdup(NameStr(c->conname)); - r->check_expr = check_expr; + r->check_expr = copyObject(check_expr); r->check_exprstate = NULL; MemoryContextSwitchTo(oldcxt); diff --git a/src/backend/utils/hash/dynahash.c b/src/backend/utils/hash/dynahash.c index 1ad155d446e..81da03629f0 100644 --- a/src/backend/utils/hash/dynahash.c +++ b/src/backend/utils/hash/dynahash.c @@ -22,10 +22,11 @@ * lookup key's hash value as a partition number --- this will work because * of the way calc_bucket() maps hash values to bucket numbers. * - * For hash tables in shared memory, the memory allocator function should - * match malloc's semantics of returning NULL on failure. For hash tables - * in local memory, we typically use palloc() which will throw error on - * failure. The code in this file has to cope with both cases. + * The memory allocator function should match malloc's semantics of returning + * NULL on failure. (This is essential for hash tables in shared memory. + * For hash tables in local memory, we used to use palloc() which will throw + * error on failure; but we no longer do, so it's untested whether this + * module will still cope with that behavior.) 
* * dynahash.c provides support for these types of lookup keys: * @@ -98,6 +99,7 @@ #include "access/xact.h" #include "common/hashfn.h" +#include "lib/ilist.h" #include "port/pg_bitutils.h" #include "storage/shmem.h" #include "storage/spin.h" @@ -195,6 +197,7 @@ struct HASHHDR long ssize; /* segment size --- must be power of 2 */ int sshift; /* segment shift = log2(ssize) */ int nelem_alloc; /* number of entries to allocate at once */ + bool isfixed; /* if true, don't enlarge */ #ifdef HASH_STATISTICS @@ -227,7 +230,6 @@ struct HTAB MemoryContext hcxt; /* memory context if default allocator used */ char *tabname; /* table name (for error messages) */ bool isshared; /* true if table is in shared memory */ - bool isfixed; /* if true, don't enlarge */ /* freezing a shared table isn't allowed, so we can keep state here */ bool frozen; /* true = no more inserts allowed */ @@ -236,6 +238,16 @@ struct HTAB Size keysize; /* hash key length in bytes */ long ssize; /* segment size --- must be power of 2 */ int sshift; /* segment shift = log2(ssize) */ + + /* + * In a USE_VALGRIND build, non-shared hashtables keep an slist chain of + * all the element blocks they have allocated. This pacifies Valgrind, + * which would otherwise often claim that the element blocks are "possibly + * lost" for lack of any non-interior pointers to their starts. + */ +#ifdef USE_VALGRIND + slist_head element_blocks; +#endif }; /* @@ -618,8 +630,10 @@ hash_create(const char *tabname, long nelem, const HASHCTL *info, int flags) } } + /* Set isfixed if requested, but not till after we build initial entries */ if (flags & HASH_FIXED_SIZE) - hashp->isfixed = true; + hctl->isfixed = true; + return hashp; } @@ -644,6 +658,8 @@ hdefault(HTAB *hashp) hctl->ssize = DEF_SEGSIZE; hctl->sshift = DEF_SEGSIZE_SHIFT; + hctl->isfixed = false; /* can be enlarged */ + #ifdef HASH_STATISTICS hctl->accesses = hctl->collisions = 0; #endif @@ -1708,23 +1724,51 @@ element_alloc(HTAB *hashp, int nelem, int freelist_idx) { HASHHDR *hctl = hashp->hctl; Size elementSize; + Size requestSize; + char *allocedBlock; HASHELEMENT *firstElement; HASHELEMENT *tmpElement; HASHELEMENT *prevElement; int i; - if (hashp->isfixed) + if (hctl->isfixed) return false; /* Each element has a HASHELEMENT header plus user data. */ elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(hctl->entrysize); + requestSize = nelem * elementSize; + + /* Add space for slist_node list link if we need one. */ +#ifdef USE_VALGRIND + if (!hashp->isshared) + requestSize += MAXALIGN(sizeof(slist_node)); +#endif + + /* Allocate the memory. */ CurrentDynaHashCxt = hashp->hcxt; - firstElement = (HASHELEMENT *) hashp->alloc(nelem * elementSize); + allocedBlock = hashp->alloc(requestSize); - if (!firstElement) + if (!allocedBlock) return false; + /* + * If USE_VALGRIND, each allocated block of elements of a non-shared + * hashtable is chained into a list, so that Valgrind won't think it's + * been leaked. 
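In the element_alloc() code that follows, Valgrind builds prepend an slist_node to each block of hash elements so that a non-interior pointer to the block's start stays reachable. A standalone sketch of the same trick with an invented intrusive list; the names here are illustrative, not the dynahash/ilist API.

#include <stdio.h>
#include <stdlib.h>

/* minimal intrusive singly-linked list, in the spirit of lib/ilist.h */
typedef struct list_node
{
    struct list_node *next;
} list_node;

static list_node *element_blocks = NULL;    /* head of the block chain */

#define ALIGN8(x)   (((x) + 7) & ~((size_t) 7))

/*
 * Allocate a block of nelem entries.  The block begins with a list link so
 * that a pointer to its start stays reachable from element_blocks; callers
 * get a pointer just past that header.
 */
static void *
element_block_alloc(size_t nelem, size_t entrysize)
{
    size_t      header = ALIGN8(sizeof(list_node));
    char       *block = malloc(header + nelem * ALIGN8(entrysize));

    if (block == NULL)
        return NULL;

    ((list_node *) block)->next = element_blocks;   /* push onto the chain */
    element_blocks = (list_node *) block;

    return block + header;
}

int
main(void)
{
    void       *elems = element_block_alloc(32, 24);

    printf("chain head=%p elements=%p\n", (void *) element_blocks, elems);
    return 0;
}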
+ */ +#ifdef USE_VALGRIND + if (hashp->isshared) + firstElement = (HASHELEMENT *) allocedBlock; + else + { + slist_push_head(&hashp->element_blocks, (slist_node *) allocedBlock); + firstElement = (HASHELEMENT *) (allocedBlock + MAXALIGN(sizeof(slist_node))); + } +#else + firstElement = (HASHELEMENT *) allocedBlock; +#endif + /* prepare to link all the new entries into the freelist */ prevElement = NULL; tmpElement = firstElement; diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c index 43b4dbccc3d..65d8cbfaed5 100644 --- a/src/backend/utils/init/miscinit.c +++ b/src/backend/utils/init/miscinit.c @@ -1183,7 +1183,6 @@ UnlinkLockFiles(int status, Datum arg) /* Should we complain if the unlink fails? */ } /* Since we're about to exit, no need to reclaim storage */ - lock_files = NIL; /* * Lock file removal should always be the last externally visible action diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index c86ceefda94..641e535a73c 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -417,12 +417,11 @@ CheckMyDatabase(const char *name, bool am_superuser, bool override_allow_connect datum = SysCacheGetAttrNotNull(DATABASEOID, tup, Anum_pg_database_datctype); ctype = TextDatumGetCString(datum); - if (pg_perm_setlocale(LC_COLLATE, collate) == NULL) - ereport(FATAL, - (errmsg("database locale is incompatible with operating system"), - errdetail("The database was initialized with LC_COLLATE \"%s\", " - " which is not recognized by setlocale().", collate), - errhint("Recreate the database with another locale or install the missing locale."))); + /* + * Historcally, we set LC_COLLATE from datcollate, as well. That's no + * longer necessary because all collation behavior is handled through + * pg_locale_t. + */ if (pg_perm_setlocale(LC_CTYPE, ctype) == NULL) ereport(FATAL, diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 667df448732..e404c345e6e 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -249,6 +249,7 @@ static void reapply_stacked_values(struct config_generic *variable, const char *curvalue, GucContext curscontext, GucSource cursource, Oid cursrole); +static void free_placeholder(struct config_string *pHolder); static bool validate_option_array_item(const char *name, const char *value, bool skipIfNoPermissions); static void write_auto_conf_file(int fd, const char *filename, ConfigVariable *head); @@ -4722,8 +4723,13 @@ AlterSystemSetConfigFile(AlterSystemStmt *altersysstmt) * the config file cannot cause postmaster start to fail, so we * don't have to be too tense about possibly installing a bad * value.) + * + * As an exception, we skip this check if this is a RESET command + * for an unknown custom GUC, else there'd be no way for users to + * remove such settings with reserved prefixes. */ - (void) assignable_custom_variable_name(name, false, ERROR); + if (value || !valid_custom_variable_name(name)) + (void) assignable_custom_variable_name(name, false, ERROR); } /* @@ -5018,16 +5024,8 @@ define_custom_variable(struct config_generic *variable) set_config_sourcefile(name, pHolder->gen.sourcefile, pHolder->gen.sourceline); - /* - * Free up as much as we conveniently can of the placeholder structure. - * (This neglects any stack items, so it's possible for some memory to be - * leaked. Since this can only happen once per session per variable, it - * doesn't seem worth spending much code on.) 
- */ - set_string_field(pHolder, pHolder->variable, NULL); - set_string_field(pHolder, &pHolder->reset_val, NULL); - - guc_free(pHolder); + /* Now we can free the no-longer-referenced placeholder variable */ + free_placeholder(pHolder); } /* @@ -5127,6 +5125,25 @@ reapply_stacked_values(struct config_generic *variable, } /* + * Free up a no-longer-referenced placeholder GUC variable. + * + * This neglects any stack items, so it's possible for some memory to be + * leaked. Since this can only happen once per session per variable, it + * doesn't seem worth spending much code on. + */ +static void +free_placeholder(struct config_string *pHolder) +{ + /* Placeholders are always STRING type, so free their values */ + Assert(pHolder->gen.vartype == PGC_STRING); + set_string_field(pHolder, pHolder->variable, NULL); + set_string_field(pHolder, &pHolder->reset_val, NULL); + + guc_free(unconstify(char *, pHolder->gen.name)); + guc_free(pHolder); +} + +/* * Functions for extensions to call to define their custom GUC variables. */ void @@ -5286,9 +5303,7 @@ MarkGUCPrefixReserved(const char *className) /* * Check for existing placeholders. We must actually remove invalid - * placeholders, else future parallel worker startups will fail. (We - * don't bother trying to free associated memory, since this shouldn't - * happen often.) + * placeholders, else future parallel worker startups will fail. */ hash_seq_init(&status, guc_hashtab); while ((hentry = (GUCHashEntry *) hash_seq_search(&status)) != NULL) @@ -5312,6 +5327,8 @@ MarkGUCPrefixReserved(const char *className) NULL); /* Remove it from any lists it's in, too */ RemoveGUCFromLists(var); + /* And free it */ + free_placeholder((struct config_string *) var); } } @@ -6711,6 +6728,7 @@ validate_option_array_item(const char *name, const char *value, { struct config_generic *gconf; + bool reset_custom; /* * There are three cases to consider: @@ -6729,16 +6747,21 @@ validate_option_array_item(const char *name, const char *value, * it's assumed to be fully validated.) * * name is not known and can't be created as a placeholder. Throw error, - * unless skipIfNoPermissions is true, in which case return false. + * unless skipIfNoPermissions or reset_custom is true. If reset_custom is + * true, this is a RESET or RESET ALL operation for an unknown custom GUC + * with a reserved prefix, in which case we want to fall through to the + * placeholder case described in the preceding paragraph (else there'd be + * no way for users to remove them). Otherwise, return false. */ - gconf = find_option(name, true, skipIfNoPermissions, ERROR); - if (!gconf) + reset_custom = (!value && valid_custom_variable_name(name)); + gconf = find_option(name, true, skipIfNoPermissions || reset_custom, ERROR); + if (!gconf && !reset_custom) { /* not known, failed to make a placeholder */ return false; } - if (gconf->flags & GUC_CUSTOM_PLACEHOLDER) + if (!gconf || gconf->flags & GUC_CUSTOM_PLACEHOLDER) { /* * We cannot do any meaningful check on the value, so only permissions diff --git a/src/backend/utils/misc/ps_status.c b/src/backend/utils/misc/ps_status.c index e08b26e8c14..4df25944deb 100644 --- a/src/backend/utils/misc/ps_status.c +++ b/src/backend/utils/misc/ps_status.c @@ -100,6 +100,17 @@ static void flush_ps_display(void); static int save_argc; static char **save_argv; +/* + * Valgrind seems not to consider the global "environ" variable as a valid + * root pointer; so when we allocate a new environment array, it claims that + * data is leaked. 
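A standalone sketch of the environment relocation ps_status.c performs, including the extra static pointer the hunk below keeps so Valgrind still sees a root for the new array; relocate_environ() and saved_new_environ are invented names for illustration.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

extern char **environ;

/* static copy of the replacement array, so a root pointer always exists */
static char **saved_new_environ;

static void
relocate_environ(void)
{
    int         n = 0;
    char      **new_environ;

    while (environ[n] != NULL)
        n++;

    new_environ = malloc((n + 1) * sizeof(char *));
    if (new_environ == NULL)
        return;

    for (int i = 0; i < n; i++)
        new_environ[i] = strdup(environ[i]);    /* move strings elsewhere */
    new_environ[n] = NULL;

    environ = new_environ;
    saved_new_environ = new_environ;            /* extra pointer for Valgrind */
}

int
main(void)
{
    relocate_environ();
    printf("first entry: %s\n", environ[0] ? environ[0] : "(empty environment)");
    return 0;
}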
To fix that, keep our own statically-allocated copy of the + * pointer. (Oddly, this doesn't seem to be a problem for "argv".) + */ +#if defined(PS_USE_CLOBBER_ARGV) && defined(USE_VALGRIND) +extern char **ps_status_new_environ; +char **ps_status_new_environ; +#endif + /* * Call this early in startup to save the original argc/argv values. @@ -206,6 +217,11 @@ save_ps_display_args(int argc, char **argv) } new_environ[i] = NULL; environ = new_environ; + + /* See notes about Valgrind above. */ +#ifdef USE_VALGRIND + ps_status_new_environ = new_environ; +#endif } /* diff --git a/src/backend/utils/mmgr/alignedalloc.c b/src/backend/utils/mmgr/alignedalloc.c index 7eea695de62..b1be7426914 100644 --- a/src/backend/utils/mmgr/alignedalloc.c +++ b/src/backend/utils/mmgr/alignedalloc.c @@ -45,6 +45,15 @@ AlignedAllocFree(void *pointer) GetMemoryChunkContext(unaligned)->name, chunk); #endif + /* + * Create a dummy vchunk covering the start of the unaligned chunk, but + * not overlapping the aligned chunk. This will be freed while pfree'ing + * the unaligned chunk, keeping Valgrind happy. Then when we return to + * the outer pfree, that will clean up the vchunk for the aligned chunk. + */ + VALGRIND_MEMPOOL_ALLOC(GetMemoryChunkContext(unaligned), unaligned, + (char *) pointer - (char *) unaligned); + /* Recursively pfree the unaligned chunk */ pfree(unaligned); } @@ -123,6 +132,15 @@ AlignedAllocRealloc(void *pointer, Size size, int flags) VALGRIND_MAKE_MEM_DEFINED(pointer, old_size); memcpy(newptr, pointer, Min(size, old_size)); + /* + * Create a dummy vchunk covering the start of the old unaligned chunk, + * but not overlapping the aligned chunk. This will be freed while + * pfree'ing the old unaligned chunk, keeping Valgrind happy. Then when + * we return to repalloc, it will move the vchunk for the aligned chunk. + */ + VALGRIND_MEMPOOL_ALLOC(ctx, unaligned, + (char *) pointer - (char *) unaligned); + pfree(unaligned); return newptr; diff --git a/src/backend/utils/mmgr/aset.c b/src/backend/utils/mmgr/aset.c index 666ecd8f78d..9ef109ca586 100644 --- a/src/backend/utils/mmgr/aset.c +++ b/src/backend/utils/mmgr/aset.c @@ -103,6 +103,8 @@ #define ALLOC_BLOCKHDRSZ MAXALIGN(sizeof(AllocBlockData)) #define ALLOC_CHUNKHDRSZ sizeof(MemoryChunk) +#define FIRST_BLOCKHDRSZ (MAXALIGN(sizeof(AllocSetContext)) + \ + ALLOC_BLOCKHDRSZ) typedef struct AllocBlockData *AllocBlock; /* forward reference */ @@ -458,6 +460,21 @@ AllocSetContextCreateInternal(MemoryContext parent, * we'd leak the header/initial block if we ereport in this stretch. */ + /* Create a vpool associated with the context */ + VALGRIND_CREATE_MEMPOOL(set, 0, false); + + /* + * Create a vchunk covering both the AllocSetContext struct and the keeper + * block's header. (Perhaps it would be more sensible for these to be two + * separate vchunks, but doing that seems to tickle bugs in some versions + * of Valgrind.) We must have these vchunks, and also a vchunk for each + * subsequently-added block header, so that Valgrind considers the + * pointers within them while checking for leaked memory. Note that + * Valgrind doesn't distinguish between these vchunks and those created by + * mcxt.c for the user-accessible-data chunks we allocate. 
+ */ + VALGRIND_MEMPOOL_ALLOC(set, set, FIRST_BLOCKHDRSZ); + /* Fill in the initial block's block header */ block = KeeperBlock(set); block->aset = set; @@ -585,6 +602,14 @@ AllocSetReset(MemoryContext context) #ifdef CLOBBER_FREED_MEMORY wipe_mem(block, block->freeptr - ((char *) block)); #endif + + /* + * We need to free the block header's vchunk explicitly, although + * the user-data vchunks within will go away in the TRIM below. + * Otherwise Valgrind complains about leaked allocations. + */ + VALGRIND_MEMPOOL_FREE(set, block); + free(block); } block = next; @@ -592,6 +617,14 @@ AllocSetReset(MemoryContext context) Assert(context->mem_allocated == keepersize); + /* + * Instruct Valgrind to throw away all the vchunks associated with this + * context, except for the one covering the AllocSetContext and + * keeper-block header. This gets rid of the vchunks for whatever user + * data is getting discarded by the context reset. + */ + VALGRIND_MEMPOOL_TRIM(set, set, FIRST_BLOCKHDRSZ); + /* Reset block size allocation sequence, too */ set->nextBlockSize = set->initBlockSize; } @@ -648,6 +681,9 @@ AllocSetDelete(MemoryContext context) freelist->first_free = (AllocSetContext *) oldset->header.nextchild; freelist->num_free--; + /* Destroy the context's vpool --- see notes below */ + VALGRIND_DESTROY_MEMPOOL(oldset); + /* All that remains is to free the header/initial block */ free(oldset); } @@ -675,13 +711,24 @@ AllocSetDelete(MemoryContext context) #endif if (!IsKeeperBlock(set, block)) + { + /* As in AllocSetReset, free block-header vchunks explicitly */ + VALGRIND_MEMPOOL_FREE(set, block); free(block); + } block = next; } Assert(context->mem_allocated == keepersize); + /* + * Destroy the vpool. We don't seem to need to explicitly free the + * initial block's header vchunk, nor any user-data vchunks that Valgrind + * still knows about; they'll all go away automatically. + */ + VALGRIND_DESTROY_MEMPOOL(set); + /* Finally, free the context header, including the keeper block */ free(set); } @@ -716,6 +763,9 @@ AllocSetAllocLarge(MemoryContext context, Size size, int flags) if (block == NULL) return MemoryContextAllocationFailure(context, size, flags); + /* Make a vchunk covering the new block's header */ + VALGRIND_MEMPOOL_ALLOC(set, block, ALLOC_BLOCKHDRSZ); + context->mem_allocated += blksize; block->aset = set; @@ -922,6 +972,9 @@ AllocSetAllocFromNewBlock(MemoryContext context, Size size, int flags, if (block == NULL) return MemoryContextAllocationFailure(context, size, flags); + /* Make a vchunk covering the new block's header */ + VALGRIND_MEMPOOL_ALLOC(set, block, ALLOC_BLOCKHDRSZ); + context->mem_allocated += blksize; block->aset = set; @@ -1104,6 +1157,10 @@ AllocSetFree(void *pointer) #ifdef CLOBBER_FREED_MEMORY wipe_mem(block, block->freeptr - ((char *) block)); #endif + + /* As in AllocSetReset, free block-header vchunks explicitly */ + VALGRIND_MEMPOOL_FREE(set, block); + free(block); } else @@ -1184,6 +1241,7 @@ AllocSetRealloc(void *pointer, Size size, int flags) * realloc() to make the containing block bigger, or smaller, with * minimum space wastage. 
*/ + AllocBlock newblock; Size chksize; Size blksize; Size oldblksize; @@ -1223,14 +1281,21 @@ AllocSetRealloc(void *pointer, Size size, int flags) blksize = chksize + ALLOC_BLOCKHDRSZ + ALLOC_CHUNKHDRSZ; oldblksize = block->endptr - ((char *) block); - block = (AllocBlock) realloc(block, blksize); - if (block == NULL) + newblock = (AllocBlock) realloc(block, blksize); + if (newblock == NULL) { /* Disallow access to the chunk header. */ VALGRIND_MAKE_MEM_NOACCESS(chunk, ALLOC_CHUNKHDRSZ); return MemoryContextAllocationFailure(&set->header, size, flags); } + /* + * Move the block-header vchunk explicitly. (mcxt.c will take care of + * moving the vchunk for the user data.) + */ + VALGRIND_MEMPOOL_CHANGE(set, block, newblock, ALLOC_BLOCKHDRSZ); + block = newblock; + /* updated separately, not to underflow when (oldblksize > blksize) */ set->header.mem_allocated -= oldblksize; set->header.mem_allocated += blksize; @@ -1294,7 +1359,7 @@ AllocSetRealloc(void *pointer, Size size, int flags) /* Ensure any padding bytes are marked NOACCESS. */ VALGRIND_MAKE_MEM_NOACCESS((char *) pointer + size, chksize - size); - /* Disallow access to the chunk header . */ + /* Disallow access to the chunk header. */ VALGRIND_MAKE_MEM_NOACCESS(chunk, ALLOC_CHUNKHDRSZ); return pointer; diff --git a/src/backend/utils/mmgr/bump.c b/src/backend/utils/mmgr/bump.c index f7a37d1b3e8..2805d55a2ec 100644 --- a/src/backend/utils/mmgr/bump.c +++ b/src/backend/utils/mmgr/bump.c @@ -45,7 +45,9 @@ #include "utils/memutils_memorychunk.h" #include "utils/memutils_internal.h" -#define Bump_BLOCKHDRSZ MAXALIGN(sizeof(BumpBlock)) +#define Bump_BLOCKHDRSZ MAXALIGN(sizeof(BumpBlock)) +#define FIRST_BLOCKHDRSZ (MAXALIGN(sizeof(BumpContext)) + \ + Bump_BLOCKHDRSZ) /* No chunk header unless built with MEMORY_CONTEXT_CHECKING */ #ifdef MEMORY_CONTEXT_CHECKING @@ -189,6 +191,12 @@ BumpContextCreate(MemoryContext parent, const char *name, Size minContextSize, * Avoid writing code that can fail between here and MemoryContextCreate; * we'd leak the header and initial block if we ereport in this stretch. */ + + /* See comments about Valgrind interactions in aset.c */ + VALGRIND_CREATE_MEMPOOL(set, 0, false); + /* This vchunk covers the BumpContext and the keeper block header */ + VALGRIND_MEMPOOL_ALLOC(set, set, FIRST_BLOCKHDRSZ); + dlist_init(&set->blocks); /* Fill in the initial block's block header */ @@ -262,6 +270,14 @@ BumpReset(MemoryContext context) BumpBlockFree(set, block); } + /* + * Instruct Valgrind to throw away all the vchunks associated with this + * context, except for the one covering the BumpContext and keeper-block + * header. This gets rid of the vchunks for whatever user data is getting + * discarded by the context reset. 
+ */ + VALGRIND_MEMPOOL_TRIM(set, set, FIRST_BLOCKHDRSZ); + /* Reset block size allocation sequence, too */ set->nextBlockSize = set->initBlockSize; @@ -279,6 +295,10 @@ BumpDelete(MemoryContext context) { /* Reset to release all releasable BumpBlocks */ BumpReset(context); + + /* Destroy the vpool -- see notes in aset.c */ + VALGRIND_DESTROY_MEMPOOL(context); + /* And free the context header and keeper block */ free(context); } @@ -318,6 +338,9 @@ BumpAllocLarge(MemoryContext context, Size size, int flags) if (block == NULL) return MemoryContextAllocationFailure(context, size, flags); + /* Make a vchunk covering the new block's header */ + VALGRIND_MEMPOOL_ALLOC(set, block, Bump_BLOCKHDRSZ); + context->mem_allocated += blksize; /* the block is completely full */ @@ -455,6 +478,9 @@ BumpAllocFromNewBlock(MemoryContext context, Size size, int flags, if (block == NULL) return MemoryContextAllocationFailure(context, size, flags); + /* Make a vchunk covering the new block's header */ + VALGRIND_MEMPOOL_ALLOC(set, block, Bump_BLOCKHDRSZ); + context->mem_allocated += blksize; /* initialize the new block */ @@ -606,6 +632,9 @@ BumpBlockFree(BumpContext *set, BumpBlock *block) wipe_mem(block, ((char *) block->endptr - (char *) block)); #endif + /* As in aset.c, free block-header vchunks explicitly */ + VALGRIND_MEMPOOL_FREE(set, block); + free(block); } diff --git a/src/backend/utils/mmgr/generation.c b/src/backend/utils/mmgr/generation.c index 18679ad4f1e..cfafc9bf082 100644 --- a/src/backend/utils/mmgr/generation.c +++ b/src/backend/utils/mmgr/generation.c @@ -45,6 +45,8 @@ #define Generation_BLOCKHDRSZ MAXALIGN(sizeof(GenerationBlock)) #define Generation_CHUNKHDRSZ sizeof(MemoryChunk) +#define FIRST_BLOCKHDRSZ (MAXALIGN(sizeof(GenerationContext)) + \ + Generation_BLOCKHDRSZ) #define Generation_CHUNK_FRACTION 8 @@ -221,6 +223,12 @@ GenerationContextCreate(MemoryContext parent, * Avoid writing code that can fail between here and MemoryContextCreate; * we'd leak the header if we ereport in this stretch. */ + + /* See comments about Valgrind interactions in aset.c */ + VALGRIND_CREATE_MEMPOOL(set, 0, false); + /* This vchunk covers the GenerationContext and the keeper block header */ + VALGRIND_MEMPOOL_ALLOC(set, set, FIRST_BLOCKHDRSZ); + dlist_init(&set->blocks); /* Fill in the initial block's block header */ @@ -309,6 +317,14 @@ GenerationReset(MemoryContext context) GenerationBlockFree(set, block); } + /* + * Instruct Valgrind to throw away all the vchunks associated with this + * context, except for the one covering the GenerationContext and + * keeper-block header. This gets rid of the vchunks for whatever user + * data is getting discarded by the context reset. 
+ */ + VALGRIND_MEMPOOL_TRIM(set, set, FIRST_BLOCKHDRSZ); + /* set it so new allocations to make use of the keeper block */ set->block = KeeperBlock(set); @@ -329,6 +345,10 @@ GenerationDelete(MemoryContext context) { /* Reset to release all releasable GenerationBlocks */ GenerationReset(context); + + /* Destroy the vpool -- see notes in aset.c */ + VALGRIND_DESTROY_MEMPOOL(context); + /* And free the context header and keeper block */ free(context); } @@ -365,6 +385,9 @@ GenerationAllocLarge(MemoryContext context, Size size, int flags) if (block == NULL) return MemoryContextAllocationFailure(context, size, flags); + /* Make a vchunk covering the new block's header */ + VALGRIND_MEMPOOL_ALLOC(set, block, Generation_BLOCKHDRSZ); + context->mem_allocated += blksize; /* block with a single (used) chunk */ @@ -487,6 +510,9 @@ GenerationAllocFromNewBlock(MemoryContext context, Size size, int flags, if (block == NULL) return MemoryContextAllocationFailure(context, size, flags); + /* Make a vchunk covering the new block's header */ + VALGRIND_MEMPOOL_ALLOC(set, block, Generation_BLOCKHDRSZ); + context->mem_allocated += blksize; /* initialize the new block */ @@ -677,6 +703,9 @@ GenerationBlockFree(GenerationContext *set, GenerationBlock *block) wipe_mem(block, block->blksize); #endif + /* As in aset.c, free block-header vchunks explicitly */ + VALGRIND_MEMPOOL_FREE(set, block); + free(block); } diff --git a/src/backend/utils/mmgr/mcxt.c b/src/backend/utils/mmgr/mcxt.c index 15fa4d0a55e..47fd774c7d2 100644 --- a/src/backend/utils/mmgr/mcxt.c +++ b/src/backend/utils/mmgr/mcxt.c @@ -8,6 +8,23 @@ * context-type-specific operations via the function pointers in a * context's MemoryContextMethods struct. * + * A note about Valgrind support: when USE_VALGRIND is defined, we provide + * support for memory leak tracking at the allocation-unit level. Valgrind + * does leak detection by tracking allocated "chunks", which can be grouped + * into "pools". The "chunk" terminology is overloaded, since we use that + * word for our allocation units, and it's sometimes important to distinguish + * those from the Valgrind objects that describe them. To reduce confusion, + * let's use the terms "vchunk" and "vpool" for the Valgrind objects. + * + * We use a separate vpool for each memory context. The context-type-specific + * code is responsible for creating and deleting the vpools, and also for + * creating vchunks to cover its management data structures such as block + * headers. (There must be a vchunk that includes every pointer we want + * Valgrind to consider for leak-tracking purposes.) This module creates + * and deletes the vchunks that cover the caller-visible allocated chunks. + * However, the context-type-specific code must handle cleaning up those + * vchunks too during memory context reset operations. + * * * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California @@ -418,8 +435,6 @@ MemoryContextResetOnly(MemoryContext context) context->methods->reset(context); context->isReset = true; - VALGRIND_DESTROY_MEMPOOL(context); - VALGRIND_CREATE_MEMPOOL(context, 0, false); } } @@ -526,8 +541,6 @@ MemoryContextDeleteOnly(MemoryContext context) context->ident = NULL; context->methods->delete_context(context); - - VALGRIND_DESTROY_MEMPOOL(context); } /* @@ -560,9 +573,7 @@ MemoryContextDeleteChildren(MemoryContext context) * the specified context, since that means it will automatically be freed * when no longer needed. 
* - * There is no API for deregistering a callback once registered. If you - * want it to not do anything anymore, adjust the state pointed to by its - * "arg" to indicate that. + * Note that callers can assume this cannot fail. */ void MemoryContextRegisterResetCallback(MemoryContext context, @@ -578,6 +589,41 @@ MemoryContextRegisterResetCallback(MemoryContext context, } /* + * MemoryContextUnregisterResetCallback + * Undo the effects of MemoryContextRegisterResetCallback. + * + * This can be used if a callback's effects are no longer required + * at some point before the context has been reset/deleted. It is the + * caller's responsibility to pfree the callback struct (if needed). + * + * An assertion failure occurs if the callback was not registered. + * We could alternatively define that case as a no-op, but that seems too + * likely to mask programming errors such as passing the wrong context. + */ +void +MemoryContextUnregisterResetCallback(MemoryContext context, + MemoryContextCallback *cb) +{ + MemoryContextCallback *prev, + *cur; + + Assert(MemoryContextIsValid(context)); + + for (prev = NULL, cur = context->reset_cbs; cur != NULL; + prev = cur, cur = cur->next) + { + if (cur != cb) + continue; + if (prev) + prev->next = cur->next; + else + context->reset_cbs = cur->next; + return; + } + Assert(false); +} + +/* * MemoryContextCallResetCallbacks * Internal function to call all registered callbacks for context. */ @@ -1137,8 +1183,6 @@ MemoryContextCreate(MemoryContext node, node->nextchild = NULL; node->allowInCritSection = false; } - - VALGRIND_CREATE_MEMPOOL(node, 0, false); } /* @@ -1421,7 +1465,13 @@ MemoryContextAllocAligned(MemoryContext context, void *unaligned; void *aligned; - /* wouldn't make much sense to waste that much space */ + /* + * Restrict alignto to ensure that it can fit into the "value" field of + * the redirection MemoryChunk, and that the distance back to the start of + * the unaligned chunk will fit into the space available for that. This + * isn't a limitation in practice, since it wouldn't make much sense to + * waste that much space. + */ Assert(alignto < (128 * 1024 * 1024)); /* ensure alignto is a power of 2 */ @@ -1458,10 +1508,15 @@ MemoryContextAllocAligned(MemoryContext context, alloc_size += 1; #endif - /* perform the actual allocation */ - unaligned = MemoryContextAllocExtended(context, alloc_size, flags); + /* + * Perform the actual allocation, but do not pass down MCXT_ALLOC_ZERO. + * This ensures that wasted bytes beyond the aligned chunk do not become + * DEFINED. + */ + unaligned = MemoryContextAllocExtended(context, alloc_size, + flags & ~MCXT_ALLOC_ZERO); - /* set the aligned pointer */ + /* compute the aligned pointer */ aligned = (void *) TYPEALIGN(alignto, (char *) unaligned + sizeof(MemoryChunk)); @@ -1489,12 +1544,23 @@ MemoryContextAllocAligned(MemoryContext context, set_sentinel(aligned, size); #endif - /* Mark the bytes before the redirection header as noaccess */ - VALGRIND_MAKE_MEM_NOACCESS(unaligned, - (char *) alignedchunk - (char *) unaligned); + /* + * MemoryContextAllocExtended marked the whole unaligned chunk as a + * vchunk. Undo that, instead making just the aligned chunk be a vchunk. + * This prevents Valgrind from complaining that the vchunk is possibly + * leaked, since only pointers to the aligned chunk will exist. 
+ * + * After these calls, the aligned chunk will be marked UNDEFINED, and all + * the rest of the unaligned chunk (the redirection chunk header, the + * padding bytes before it, and any wasted trailing bytes) will be marked + * NOACCESS, which is what we want. + */ + VALGRIND_MEMPOOL_FREE(context, unaligned); + VALGRIND_MEMPOOL_ALLOC(context, aligned, size); - /* Disallow access to the redirection chunk header. */ - VALGRIND_MAKE_MEM_NOACCESS(alignedchunk, sizeof(MemoryChunk)); + /* Now zero (and make DEFINED) just the aligned chunk, if requested */ + if ((flags & MCXT_ALLOC_ZERO) != 0) + MemSetAligned(aligned, 0, size); return aligned; } @@ -1528,16 +1594,12 @@ void pfree(void *pointer) { #ifdef USE_VALGRIND - MemoryContextMethodID method = GetMemoryChunkMethodID(pointer); MemoryContext context = GetMemoryChunkContext(pointer); #endif MCXT_METHOD(pointer, free_p) (pointer); -#ifdef USE_VALGRIND - if (method != MCTX_ALIGNED_REDIRECT_ID) - VALGRIND_MEMPOOL_FREE(context, pointer); -#endif + VALGRIND_MEMPOOL_FREE(context, pointer); } /* @@ -1547,9 +1609,6 @@ pfree(void *pointer) void * repalloc(void *pointer, Size size) { -#ifdef USE_VALGRIND - MemoryContextMethodID method = GetMemoryChunkMethodID(pointer); -#endif #if defined(USE_ASSERT_CHECKING) || defined(USE_VALGRIND) MemoryContext context = GetMemoryChunkContext(pointer); #endif @@ -1572,10 +1631,7 @@ repalloc(void *pointer, Size size) */ ret = MCXT_METHOD(pointer, realloc) (pointer, size, 0); -#ifdef USE_VALGRIND - if (method != MCTX_ALIGNED_REDIRECT_ID) - VALGRIND_MEMPOOL_CHANGE(context, pointer, ret, size); -#endif + VALGRIND_MEMPOOL_CHANGE(context, pointer, ret, size); return ret; } diff --git a/src/backend/utils/mmgr/slab.c b/src/backend/utils/mmgr/slab.c index d32c0d318fb..0e35abcf5a0 100644 --- a/src/backend/utils/mmgr/slab.c +++ b/src/backend/utils/mmgr/slab.c @@ -377,6 +377,11 @@ SlabContextCreate(MemoryContext parent, * we'd leak the header if we ereport in this stretch. */ + /* See comments about Valgrind interactions in aset.c */ + VALGRIND_CREATE_MEMPOOL(slab, 0, false); + /* This vchunk covers the SlabContext only */ + VALGRIND_MEMPOOL_ALLOC(slab, slab, sizeof(SlabContext)); + /* Fill in SlabContext-specific header fields */ slab->chunkSize = (uint32) chunkSize; slab->fullChunkSize = (uint32) fullChunkSize; @@ -451,6 +456,10 @@ SlabReset(MemoryContext context) #ifdef CLOBBER_FREED_MEMORY wipe_mem(block, slab->blockSize); #endif + + /* As in aset.c, free block-header vchunks explicitly */ + VALGRIND_MEMPOOL_FREE(slab, block); + free(block); context->mem_allocated -= slab->blockSize; } @@ -467,11 +476,23 @@ SlabReset(MemoryContext context) #ifdef CLOBBER_FREED_MEMORY wipe_mem(block, slab->blockSize); #endif + + /* As in aset.c, free block-header vchunks explicitly */ + VALGRIND_MEMPOOL_FREE(slab, block); + free(block); context->mem_allocated -= slab->blockSize; } } + /* + * Instruct Valgrind to throw away all the vchunks associated with this + * context, except for the one covering the SlabContext. This gets rid of + * the vchunks for whatever user data is getting discarded by the context + * reset. 
+ */ + VALGRIND_MEMPOOL_TRIM(slab, slab, sizeof(SlabContext)); + slab->curBlocklistIndex = 0; Assert(context->mem_allocated == 0); @@ -486,6 +507,10 @@ SlabDelete(MemoryContext context) { /* Reset to release all the SlabBlocks */ SlabReset(context); + + /* Destroy the vpool -- see notes in aset.c */ + VALGRIND_DESTROY_MEMPOOL(context); + /* And free the context header */ free(context); } @@ -567,6 +592,9 @@ SlabAllocFromNewBlock(MemoryContext context, Size size, int flags) if (unlikely(block == NULL)) return MemoryContextAllocationFailure(context, size, flags); + /* Make a vchunk covering the new block's header */ + VALGRIND_MEMPOOL_ALLOC(slab, block, Slab_BLOCKHDRSZ); + block->slab = slab; context->mem_allocated += slab->blockSize; @@ -795,6 +823,10 @@ SlabFree(void *pointer) #ifdef CLOBBER_FREED_MEMORY wipe_mem(block, slab->blockSize); #endif + + /* As in aset.c, free block-header vchunks explicitly */ + VALGRIND_MEMPOOL_FREE(slab, block); + free(block); slab->header.mem_allocated -= slab->blockSize; } |
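
The Valgrind changes above apply one pattern uniformly across aset.c, bump.c, generation.c, and slab.c: one vpool per memory context, a vchunk covering the context header (and keeper-block header where there is one), a vchunk for each subsequently malloc'd block header, explicit MEMPOOL_FREE of block-header vchunks when blocks are released, MEMPOOL_TRIM on context reset, and DESTROY_MEMPOOL on context delete, while mcxt.c manages the vchunks for caller-visible chunks. The standalone sketch below illustrates that lifecycle outside PostgreSQL; it is not code from this patch. The ToyContext/ToyBlock structures and sizes are hypothetical, alignment and oversized requests are ignored for brevity, and it assumes <valgrind/valgrind.h> from a Valgrind installation is available.

/*
 * Minimal sketch of the vpool/vchunk lifecycle used by this patch.
 * Hypothetical structures; assumes <valgrind/valgrind.h> is installed.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdlib.h>
#include <valgrind/valgrind.h>

typedef struct ToyBlock
{
	struct ToyBlock *next;		/* singly linked list of blocks */
	size_t		used;			/* bytes handed out from data[] */
	char		data[1024];
} ToyBlock;

typedef struct ToyContext
{
	ToyBlock   *blocks;
} ToyContext;

static ToyContext *
toy_create(void)
{
	ToyContext *cxt = malloc(sizeof(ToyContext));

	cxt->blocks = NULL;
	/* One vpool per context, plus a vchunk covering the context header */
	VALGRIND_CREATE_MEMPOOL(cxt, 0, false);
	VALGRIND_MEMPOOL_ALLOC(cxt, cxt, sizeof(ToyContext));
	return cxt;
}

static void *
toy_alloc(ToyContext *cxt, size_t size)
{
	ToyBlock   *blk = cxt->blocks;
	void	   *chunk;

	if (blk == NULL || blk->used + size > sizeof(blk->data))
	{
		blk = malloc(sizeof(ToyBlock));
		blk->next = cxt->blocks;
		blk->used = 0;
		cxt->blocks = blk;
		/* vchunk covering just the block header, so Valgrind scans its pointers */
		VALGRIND_MEMPOOL_ALLOC(cxt, blk, offsetof(ToyBlock, data));
	}
	chunk = blk->data + blk->used;
	blk->used += size;
	/* vchunk for the caller-visible chunk (mcxt.c's responsibility in the patch) */
	VALGRIND_MEMPOOL_ALLOC(cxt, chunk, size);
	return chunk;
}

static void
toy_reset(ToyContext *cxt)
{
	ToyBlock   *blk = cxt->blocks;

	while (blk)
	{
		ToyBlock   *next = blk->next;

		/* free block-header vchunks explicitly, as aset.c et al. now do */
		VALGRIND_MEMPOOL_FREE(cxt, blk);
		free(blk);
		blk = next;
	}
	cxt->blocks = NULL;
	/* discard all remaining vchunks except the one covering the context header */
	VALGRIND_MEMPOOL_TRIM(cxt, cxt, sizeof(ToyContext));
}

static void
toy_delete(ToyContext *cxt)
{
	toy_reset(cxt);
	VALGRIND_DESTROY_MEMPOOL(cxt);
	free(cxt);
}

As in the patch, every pointer Valgrind must follow for leak analysis (context header, block headers, caller-visible chunks) lies inside some vchunk, and a context reset can discard the bulk of them with a single MEMPOOL_TRIM rather than freeing each user-data vchunk individually.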