Diffstat (limited to 'src/backend/storage')
-rw-r--r--  src/backend/storage/aio/method_io_uring.c  210
-rw-r--r--  src/backend/storage/aio/method_worker.c      62
-rw-r--r--  src/backend/storage/buffer/bufmgr.c          10
-rw-r--r--  src/backend/storage/ipc/dsm_registry.c       49
-rw-r--r--  src/backend/storage/ipc/procarray.c          52
-rw-r--r--  src/backend/storage/ipc/standby.c             4
-rw-r--r--  src/backend/storage/lmgr/lock.c              20
-rw-r--r--  src/backend/storage/lmgr/predicate.c         11
8 files changed, 311 insertions, 107 deletions
diff --git a/src/backend/storage/aio/method_io_uring.c b/src/backend/storage/aio/method_io_uring.c
index b78048328e1..0a8c054162f 100644
--- a/src/backend/storage/aio/method_io_uring.c
+++ b/src/backend/storage/aio/method_io_uring.c
@@ -29,6 +29,9 @@
#ifdef IOMETHOD_IO_URING_ENABLED
+#include <sys/mman.h>
+#include <unistd.h>
+
#include <liburing.h>
#include "miscadmin.h"
@@ -94,12 +97,32 @@ PgAioUringContext
struct io_uring io_uring_ring;
} PgAioUringContext;
+/*
+ * Information about the capabilities that io_uring has.
+ *
+ * Depending on the liburing and kernel versions, different features are
+ * supported. At least for the kernel, a version check does not suffice, as
+ * various vendors backport features to older kernels :(.
+ */
+typedef struct PgAioUringCaps
+{
+ bool checked;
+ /* -1 if io_uring_queue_init_mem() is unsupported */
+ int mem_init_size;
+} PgAioUringCaps;
+
+
/* PgAioUringContexts for all backends */
static PgAioUringContext *pgaio_uring_contexts;
/* the current backend's context */
static PgAioUringContext *pgaio_my_uring_context;
+static PgAioUringCaps pgaio_uring_caps =
+{
+ .checked = false,
+ .mem_init_size = -1,
+};
static uint32
pgaio_uring_procs(void)
@@ -111,16 +134,145 @@ pgaio_uring_procs(void)
return MaxBackends + NUM_AUXILIARY_PROCS - MAX_IO_WORKERS;
}
-static Size
+/*
+ * Initializes pgaio_uring_caps, unless that's already done.
+ */
+static void
+pgaio_uring_check_capabilities(void)
+{
+ if (pgaio_uring_caps.checked)
+ return;
+
+ /*
+ * By default io_uring creates a shared memory mapping for each io_uring
+ * instance, leading to a large number of memory mappings. Unfortunately a
+ * large number of memory mappings slows things down; backend exit is
+ * particularly affected. To address that, newer kernels (6.5) support
+ * using user-provided memory for the ring; by putting that memory into
+ * shared memory we don't need any additional mappings.
+ *
+ * To know whether this is supported, we unfortunately need to probe the
+ * kernel by trying to create a ring with userspace-provided memory. This
+ * also has a secondary benefit: We can determine precisely how much
+ * memory we need for each io_uring instance.
+ */
+#if defined(HAVE_LIBURING_QUEUE_INIT_MEM) && defined(IORING_SETUP_NO_MMAP)
+ {
+ struct io_uring test_ring;
+ size_t ring_size;
+ void *ring_ptr;
+ struct io_uring_params p = {0};
+ int ret;
+
+ /*
+ * Liburing does not yet provide an API to query how much memory a
+ * ring will need. So we over-estimate it here. As the memory is freed
+ * just below, that's only a small temporary waste of memory.
+ *
+ * 1MB is more than enough for rings within io_max_concurrency's
+ * range.
+ */
+ ring_size = 1024 * 1024;
+
+ /*
+ * Hard to believe a system exists where 1MB would not be a multiple
+ * of the page size. But it's cheap to ensure...
+ */
+ ring_size -= ring_size % sysconf(_SC_PAGESIZE);
+
+ ring_ptr = mmap(NULL, ring_size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+ if (ring_ptr == MAP_FAILED)
+ elog(ERROR,
+ "mmap(%zu) to determine io_uring_queue_init_mem() support failed: %m",
+ ring_size);
+
+ ret = io_uring_queue_init_mem(io_max_concurrency, &test_ring, &p, ring_ptr, ring_size);
+ if (ret > 0)
+ {
+ pgaio_uring_caps.mem_init_size = ret;
+
+ elog(DEBUG1,
+ "can use combined memory mapping for io_uring, each ring needs %d bytes",
+ ret);
+
+ /* clean up the created ring, it was just for a test */
+ io_uring_queue_exit(&test_ring);
+ }
+ else
+ {
+ /*
+ * There are different reasons for ring creation to fail, but it's
+ * ok to treat that just as io_uring_queue_init_mem() not being
+ * supported. We'll report a more detailed error in
+ * pgaio_uring_shmem_init().
+ */
+ errno = -ret;
+ elog(DEBUG1,
+ "cannot use combined memory mapping for io_uring, ring creation failed: %m");
+
+ }
+
+ if (munmap(ring_ptr, ring_size) != 0)
+ elog(ERROR, "munmap() failed: %m");
+ }
+#else
+ {
+ elog(DEBUG1,
+ "can't use combined memory mapping for io_uring, kernel or liburing too old");
+ }
+#endif
+
+ pgaio_uring_caps.checked = true;
+}
+
+/*
+ * Memory for all PgAioUringContext instances
+ */
+static size_t
pgaio_uring_context_shmem_size(void)
{
return mul_size(pgaio_uring_procs(), sizeof(PgAioUringContext));
}
+/*
+ * Size of the combined memory used by the io_uring instances. Returns 0 if
+ * that is not supported by the kernel or liburing.
+ */
+static size_t
+pgaio_uring_ring_shmem_size(void)
+{
+ size_t sz = 0;
+
+ if (pgaio_uring_caps.mem_init_size > 0)
+ {
+ /*
+ * Memory for the rings needs to be aligned to the page boundary, so
+ * reserve space for that. Luckily it does not need to be aligned to
+ * hugepage boundaries, even if huge pages are used.
+ */
+ sz = add_size(sz, sysconf(_SC_PAGESIZE));
+ sz = add_size(sz, mul_size(pgaio_uring_procs(),
+ pgaio_uring_caps.mem_init_size));
+ }
+
+ return sz;
+}
+
static size_t
pgaio_uring_shmem_size(void)
{
- return pgaio_uring_context_shmem_size();
+ size_t sz;
+
+ /*
+ * Kernel and liburing support for various features influences how much
+ * shmem we need, so perform the necessary checks.
+ */
+ pgaio_uring_check_capabilities();
+
+ sz = pgaio_uring_context_shmem_size();
+ sz = add_size(sz, pgaio_uring_ring_shmem_size());
+
+ return sz;
}
static void
@@ -128,13 +280,38 @@ pgaio_uring_shmem_init(bool first_time)
{
int TotalProcs = pgaio_uring_procs();
bool found;
+ char *shmem;
+ size_t ring_mem_remain = 0;
+ char *ring_mem_next = 0;
- pgaio_uring_contexts = (PgAioUringContext *)
- ShmemInitStruct("AioUring", pgaio_uring_shmem_size(), &found);
-
+ /*
+ * We allocate memory for all PgAioUringContext instances and, if
+ * supported, the memory required for each of the io_uring instances, in
+ * one ShmemInitStruct().
+ */
+ shmem = ShmemInitStruct("AioUringContext", pgaio_uring_shmem_size(), &found);
if (found)
return;
+ pgaio_uring_contexts = (PgAioUringContext *) shmem;
+ shmem += pgaio_uring_context_shmem_size();
+
+ /* if supported, handle memory alignment / sizing for io_uring memory */
+ if (pgaio_uring_caps.mem_init_size > 0)
+ {
+ ring_mem_remain = pgaio_uring_ring_shmem_size();
+ ring_mem_next = (char *) shmem;
+
+ /* align to page boundary, see also pgaio_uring_ring_shmem_size() */
+ ring_mem_next = (char *) TYPEALIGN(sysconf(_SC_PAGESIZE), ring_mem_next);
+
+ /* account for alignment */
+ ring_mem_remain -= ring_mem_next - shmem;
+ shmem += ring_mem_next - shmem;
+
+ shmem += ring_mem_remain;
+ }
+
for (int contextno = 0; contextno < TotalProcs; contextno++)
{
PgAioUringContext *context = &pgaio_uring_contexts[contextno];
@@ -158,7 +335,28 @@ pgaio_uring_shmem_init(bool first_time)
* be worth using that - also need to evaluate if that causes
* noticeable additional contention?
*/
- ret = io_uring_queue_init(io_max_concurrency, &context->io_uring_ring, 0);
+
+ /*
+ * If supported (cf. pgaio_uring_check_capabilities()), create the ring
+ * with its data in shared memory. Otherwise fall back to io_uring
+ * creating a memory mapping for each ring.
+ */
+#if defined(HAVE_LIBURING_QUEUE_INIT_MEM) && defined(IORING_SETUP_NO_MMAP)
+ if (pgaio_uring_caps.mem_init_size > 0)
+ {
+ struct io_uring_params p = {0};
+
+ ret = io_uring_queue_init_mem(io_max_concurrency, &context->io_uring_ring, &p, ring_mem_next, ring_mem_remain);
+
+ ring_mem_remain -= ret;
+ ring_mem_next += ret;
+ }
+ else
+#endif
+ {
+ ret = io_uring_queue_init(io_max_concurrency, &context->io_uring_ring, 0);
+ }
+
if (ret < 0)
{
char *hint = NULL;
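
For readers unfamiliar with the liburing API used above, the following standalone sketch mirrors the capability probe in pgaio_uring_check_capabilities(). It is illustrative only, not part of the patch: it assumes liburing 2.5+ (which provides io_uring_queue_init_mem()) and, like the patch, treats a positive return value as the number of bytes of caller-provided memory the ring consumed. Build with something like: cc probe.c -luring.

/* probe.c -- hedged standalone sketch, not PostgreSQL code */
#include <liburing.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int
main(void)
{
	struct io_uring ring;
	struct io_uring_params params;
	size_t		ring_size = 1024 * 1024;
	void	   *ring_ptr;
	int			ret;

	/* round down to a page multiple, as the patch does */
	ring_size -= ring_size % sysconf(_SC_PAGESIZE);

	ring_ptr = mmap(NULL, ring_size, PROT_READ | PROT_WRITE,
					MAP_SHARED | MAP_ANONYMOUS, -1, 0);
	if (ring_ptr == MAP_FAILED)
		return 1;

	memset(&params, 0, sizeof(params));
	ret = io_uring_queue_init_mem(64, &ring, &params, ring_ptr, ring_size);
	if (ret > 0)
	{
		/* ret is the number of bytes of ring_ptr actually consumed */
		printf("supported, ring needs %d bytes\n", ret);
		io_uring_queue_exit(&ring);
	}
	else
		printf("not supported: %s\n", strerror(-ret));

	munmap(ring_ptr, ring_size);
	return 0;
}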
diff --git a/src/backend/storage/aio/method_worker.c b/src/backend/storage/aio/method_worker.c
index 36be179678d..bf8f77e6ff6 100644
--- a/src/backend/storage/aio/method_worker.c
+++ b/src/backend/storage/aio/method_worker.c
@@ -52,26 +52,26 @@
#define IO_WORKER_WAKEUP_FANOUT 2
-typedef struct AioWorkerSubmissionQueue
+typedef struct PgAioWorkerSubmissionQueue
{
uint32 size;
uint32 mask;
uint32 head;
uint32 tail;
- uint32 ios[FLEXIBLE_ARRAY_MEMBER];
-} AioWorkerSubmissionQueue;
+ uint32 sqes[FLEXIBLE_ARRAY_MEMBER];
+} PgAioWorkerSubmissionQueue;
-typedef struct AioWorkerSlot
+typedef struct PgAioWorkerSlot
{
Latch *latch;
bool in_use;
-} AioWorkerSlot;
+} PgAioWorkerSlot;
-typedef struct AioWorkerControl
+typedef struct PgAioWorkerControl
{
uint64 idle_worker_mask;
- AioWorkerSlot workers[FLEXIBLE_ARRAY_MEMBER];
-} AioWorkerControl;
+ PgAioWorkerSlot workers[FLEXIBLE_ARRAY_MEMBER];
+} PgAioWorkerControl;
static size_t pgaio_worker_shmem_size(void);
@@ -96,8 +96,8 @@ int io_workers = 3;
static int io_worker_queue_size = 64;
static int MyIoWorkerId;
-static AioWorkerSubmissionQueue *io_worker_submission_queue;
-static AioWorkerControl *io_worker_control;
+static PgAioWorkerSubmissionQueue *io_worker_submission_queue;
+static PgAioWorkerControl *io_worker_control;
static size_t
@@ -106,15 +106,15 @@ pgaio_worker_queue_shmem_size(int *queue_size)
/* Round size up to next power of two so we can make a mask. */
*queue_size = pg_nextpower2_32(io_worker_queue_size);
- return offsetof(AioWorkerSubmissionQueue, ios) +
+ return offsetof(PgAioWorkerSubmissionQueue, sqes) +
sizeof(uint32) * *queue_size;
}
static size_t
pgaio_worker_control_shmem_size(void)
{
- return offsetof(AioWorkerControl, workers) +
- sizeof(AioWorkerSlot) * MAX_IO_WORKERS;
+ return offsetof(PgAioWorkerControl, workers) +
+ sizeof(PgAioWorkerSlot) * MAX_IO_WORKERS;
}
static size_t
@@ -162,7 +162,7 @@ pgaio_worker_shmem_init(bool first_time)
}
static int
-pgaio_choose_idle_worker(void)
+pgaio_worker_choose_idle(void)
{
int worker;
@@ -172,6 +172,7 @@ pgaio_choose_idle_worker(void)
/* Find the lowest bit position, and clear it. */
worker = pg_rightmost_one_pos64(io_worker_control->idle_worker_mask);
io_worker_control->idle_worker_mask &= ~(UINT64_C(1) << worker);
+ Assert(io_worker_control->workers[worker].in_use);
return worker;
}
@@ -179,7 +180,7 @@ pgaio_choose_idle_worker(void)
static bool
pgaio_worker_submission_queue_insert(PgAioHandle *ioh)
{
- AioWorkerSubmissionQueue *queue;
+ PgAioWorkerSubmissionQueue *queue;
uint32 new_head;
queue = io_worker_submission_queue;
@@ -191,7 +192,7 @@ pgaio_worker_submission_queue_insert(PgAioHandle *ioh)
return false; /* full */
}
- queue->ios[queue->head] = pgaio_io_get_id(ioh);
+ queue->sqes[queue->head] = pgaio_io_get_id(ioh);
queue->head = new_head;
return true;
@@ -200,14 +201,14 @@ pgaio_worker_submission_queue_insert(PgAioHandle *ioh)
static uint32
pgaio_worker_submission_queue_consume(void)
{
- AioWorkerSubmissionQueue *queue;
+ PgAioWorkerSubmissionQueue *queue;
uint32 result;
queue = io_worker_submission_queue;
if (queue->tail == queue->head)
return UINT32_MAX; /* empty */
- result = queue->ios[queue->tail];
+ result = queue->sqes[queue->tail];
queue->tail = (queue->tail + 1) & (queue->size - 1);
return result;
@@ -240,37 +241,37 @@ pgaio_worker_needs_synchronous_execution(PgAioHandle *ioh)
}
static void
-pgaio_worker_submit_internal(int nios, PgAioHandle *ios[])
+pgaio_worker_submit_internal(int num_staged_ios, PgAioHandle **staged_ios)
{
PgAioHandle *synchronous_ios[PGAIO_SUBMIT_BATCH_SIZE];
int nsync = 0;
Latch *wakeup = NULL;
int worker;
- Assert(nios <= PGAIO_SUBMIT_BATCH_SIZE);
+ Assert(num_staged_ios <= PGAIO_SUBMIT_BATCH_SIZE);
LWLockAcquire(AioWorkerSubmissionQueueLock, LW_EXCLUSIVE);
- for (int i = 0; i < nios; ++i)
+ for (int i = 0; i < num_staged_ios; ++i)
{
- Assert(!pgaio_worker_needs_synchronous_execution(ios[i]));
- if (!pgaio_worker_submission_queue_insert(ios[i]))
+ Assert(!pgaio_worker_needs_synchronous_execution(staged_ios[i]));
+ if (!pgaio_worker_submission_queue_insert(staged_ios[i]))
{
/*
* We'll do it synchronously, but only after we've sent as many as
* we can to workers, to maximize concurrency.
*/
- synchronous_ios[nsync++] = ios[i];
+ synchronous_ios[nsync++] = staged_ios[i];
continue;
}
if (wakeup == NULL)
{
/* Choose an idle worker to wake up if we haven't already. */
- worker = pgaio_choose_idle_worker();
+ worker = pgaio_worker_choose_idle();
if (worker >= 0)
wakeup = io_worker_control->workers[worker].latch;
- pgaio_debug_io(DEBUG4, ios[i],
+ pgaio_debug_io(DEBUG4, staged_ios[i],
"choosing worker %d",
worker);
}
@@ -316,6 +317,7 @@ pgaio_worker_die(int code, Datum arg)
Assert(io_worker_control->workers[MyIoWorkerId].in_use);
Assert(io_worker_control->workers[MyIoWorkerId].latch == MyLatch);
+ io_worker_control->idle_worker_mask &= ~(UINT64_C(1) << MyIoWorkerId);
io_worker_control->workers[MyIoWorkerId].in_use = false;
io_worker_control->workers[MyIoWorkerId].latch = NULL;
LWLockRelease(AioWorkerSubmissionQueueLock);
@@ -488,7 +490,7 @@ IoWorkerMain(const void *startup_data, size_t startup_data_len)
IO_WORKER_WAKEUP_FANOUT);
for (int i = 0; i < nwakeups; ++i)
{
- if ((worker = pgaio_choose_idle_worker()) < 0)
+ if ((worker = pgaio_worker_choose_idle()) < 0)
break;
latches[nlatches++] = io_worker_control->workers[worker].latch;
}
@@ -573,6 +575,12 @@ IoWorkerMain(const void *startup_data, size_t startup_data_len)
}
CHECK_FOR_INTERRUPTS();
+
+ if (ConfigReloadPending)
+ {
+ ConfigReloadPending = false;
+ ProcessConfigFile(PGC_SIGHUP);
+ }
}
error_context_stack = errcallback.previous;
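
The renamed PgAioWorkerSubmissionQueue above is a ring buffer whose capacity is rounded up to a power of two, so wrap-around can be done with a bitmask instead of a modulo. Below is a simplified, self-contained sketch of that indexing scheme; the names are hypothetical, the capacity is fixed instead of a flexible array member, and there is no locking (the real queue is protected by AioWorkerSubmissionQueueLock).

/* sketch of the power-of-two submission queue indexing */
#include <stdbool.h>
#include <stdint.h>

#define SKETCH_QUEUE_SIZE 64			/* must be a power of two */

typedef struct SketchSubmissionQueue
{
	uint32_t	head;
	uint32_t	tail;
	uint32_t	sqes[SKETCH_QUEUE_SIZE];
} SketchSubmissionQueue;

/* Returns false if the queue is full, mirroring ..._queue_insert(). */
static bool
sketch_queue_insert(SketchSubmissionQueue *q, uint32_t io_id)
{
	uint32_t	new_head = (q->head + 1) & (SKETCH_QUEUE_SIZE - 1);

	if (new_head == q->tail)
		return false;				/* full: one slot is kept free */
	q->sqes[q->head] = io_id;
	q->head = new_head;
	return true;
}

/* Returns UINT32_MAX if the queue is empty, mirroring ..._queue_consume(). */
static uint32_t
sketch_queue_consume(SketchSubmissionQueue *q)
{
	uint32_t	result;

	if (q->tail == q->head)
		return UINT32_MAX;			/* empty */
	result = q->sqes[q->tail];
	q->tail = (q->tail + 1) & (SKETCH_QUEUE_SIZE - 1);
	return result;
}

int
main(void)
{
	SketchSubmissionQueue q = {0};

	sketch_queue_insert(&q, 42);
	return sketch_queue_consume(&q) == 42 ? 0 : 1;
}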
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index bd68d7e0ca9..6afdd28dba6 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -3339,10 +3339,10 @@ UnpinBufferNoOwner(BufferDesc *buf)
* BufferSync -- Write out all dirty buffers in the pool.
*
* This is called at checkpoint time to write out all dirty shared buffers.
- * The checkpoint request flags should be passed in. If CHECKPOINT_IMMEDIATE
- * is set, we disable delays between writes; if CHECKPOINT_IS_SHUTDOWN,
- * CHECKPOINT_END_OF_RECOVERY or CHECKPOINT_FLUSH_ALL is set, we write even
- * unlogged buffers, which are otherwise skipped. The remaining flags
+ * The checkpoint request flags should be passed in. If CHECKPOINT_FAST is
+ * set, we disable delays between writes; if CHECKPOINT_IS_SHUTDOWN,
+ * CHECKPOINT_END_OF_RECOVERY or CHECKPOINT_FLUSH_UNLOGGED is set, we write
+ * even unlogged buffers, which are otherwise skipped. The remaining flags
* currently have no effect here.
*/
static void
@@ -3367,7 +3367,7 @@ BufferSync(int flags)
* recovery, we write all dirty buffers.
*/
if (!((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
- CHECKPOINT_FLUSH_ALL))))
+ CHECKPOINT_FLUSH_UNLOGGED))))
mask |= BM_PERMANENT;
/*
diff --git a/src/backend/storage/ipc/dsm_registry.c b/src/backend/storage/ipc/dsm_registry.c
index 828c2ff0c7f..1682cc6d34c 100644
--- a/src/backend/storage/ipc/dsm_registry.c
+++ b/src/backend/storage/ipc/dsm_registry.c
@@ -40,10 +40,12 @@
#include "postgres.h"
+#include "funcapi.h"
#include "lib/dshash.h"
#include "storage/dsm_registry.h"
#include "storage/lwlock.h"
#include "storage/shmem.h"
+#include "utils/builtins.h"
#include "utils/memutils.h"
#define DSMR_NAME_LEN 128
@@ -88,6 +90,13 @@ typedef enum DSMREntryType
DSMR_ENTRY_TYPE_DSH,
} DSMREntryType;
+static const char *const DSMREntryTypeNames[] =
+{
+ [DSMR_ENTRY_TYPE_DSM] = "segment",
+ [DSMR_ENTRY_TYPE_DSA] = "area",
+ [DSMR_ENTRY_TYPE_DSH] = "hash",
+};
+
typedef struct DSMRegistryEntry
{
char name[DSMR_NAME_LEN];
@@ -435,3 +444,43 @@ GetNamedDSHash(const char *name, const dshash_parameters *params, bool *found)
return ret;
}
+
+Datum
+pg_get_dsm_registry_allocations(PG_FUNCTION_ARGS)
+{
+ ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+ DSMRegistryEntry *entry;
+ MemoryContext oldcontext;
+ dshash_seq_status status;
+
+ InitMaterializedSRF(fcinfo, MAT_SRF_USE_EXPECTED_DESC);
+
+ /* Be sure any local memory allocated by DSM/DSA routines is persistent. */
+ oldcontext = MemoryContextSwitchTo(TopMemoryContext);
+ init_dsm_registry();
+ MemoryContextSwitchTo(oldcontext);
+
+ dshash_seq_init(&status, dsm_registry_table, false);
+ while ((entry = dshash_seq_next(&status)) != NULL)
+ {
+ Datum vals[3];
+ bool nulls[3] = {0};
+
+ vals[0] = CStringGetTextDatum(entry->name);
+ vals[1] = CStringGetTextDatum(DSMREntryTypeNames[entry->type]);
+
+ /*
+ * Since we can't know the size of DSA/dshash entries without first
+ * attaching to them, return NULL for those.
+ */
+ if (entry->type == DSMR_ENTRY_TYPE_DSM)
+ vals[2] = Int64GetDatum(entry->data.dsm.size);
+ else
+ nulls[2] = true;
+
+ tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, vals, nulls);
+ }
+ dshash_seq_term(&status);
+
+ return (Datum) 0;
+}
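
pg_get_dsm_registry_allocations() reports entries created through the DSM registry. As a rough sketch of how an entry ends up there, the hypothetical extension snippet below reserves a named segment via GetNamedDSMSegment(); the struct and names are invented, and the function's signature is assumed to match storage/dsm_registry.h.

/* hedged sketch of extension code that creates a registry entry */
#include "postgres.h"

#include "storage/dsm_registry.h"

/* hypothetical shared state for an extension */
typedef struct MySharedState
{
	int			counter;
} MySharedState;

static void
my_shared_state_init(void *ptr)
{
	MySharedState *state = (MySharedState *) ptr;

	state->counter = 0;
}

/*
 * Attach to (or create) the named segment.  Afterwards a row with name
 * 'my_extension_state', type 'segment' and the segment's size should show
 * up in pg_get_dsm_registry_allocations().
 */
static MySharedState *
my_attach_shared_state(void)
{
	bool		found;

	return (MySharedState *)
		GetNamedDSMSegment("my_extension_state", sizeof(MySharedState),
						   my_shared_state_init, &found);
}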
diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c
index e5b945a9ee3..2418967def6 100644
--- a/src/backend/storage/ipc/procarray.c
+++ b/src/backend/storage/ipc/procarray.c
@@ -1622,58 +1622,6 @@ TransactionIdIsInProgress(TransactionId xid)
return false;
}
-/*
- * TransactionIdIsActive -- is xid the top-level XID of an active backend?
- *
- * This differs from TransactionIdIsInProgress in that it ignores prepared
- * transactions, as well as transactions running on the primary if we're in
- * hot standby. Also, we ignore subtransactions since that's not needed
- * for current uses.
- */
-bool
-TransactionIdIsActive(TransactionId xid)
-{
- bool result = false;
- ProcArrayStruct *arrayP = procArray;
- TransactionId *other_xids = ProcGlobal->xids;
- int i;
-
- /*
- * Don't bother checking a transaction older than RecentXmin; it could not
- * possibly still be running.
- */
- if (TransactionIdPrecedes(xid, RecentXmin))
- return false;
-
- LWLockAcquire(ProcArrayLock, LW_SHARED);
-
- for (i = 0; i < arrayP->numProcs; i++)
- {
- int pgprocno = arrayP->pgprocnos[i];
- PGPROC *proc = &allProcs[pgprocno];
- TransactionId pxid;
-
- /* Fetch xid just once - see GetNewTransactionId */
- pxid = UINT32_ACCESS_ONCE(other_xids[i]);
-
- if (!TransactionIdIsValid(pxid))
- continue;
-
- if (proc->pid == 0)
- continue; /* ignore prepared transactions */
-
- if (TransactionIdEquals(pxid, xid))
- {
- result = true;
- break;
- }
- }
-
- LWLockRelease(ProcArrayLock);
-
- return result;
-}
-
/*
* Determine XID horizons.
diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c
index 7fa8d9247e0..4222bdab078 100644
--- a/src/backend/storage/ipc/standby.c
+++ b/src/backend/storage/ipc/standby.c
@@ -1376,7 +1376,7 @@ LogCurrentRunningXacts(RunningTransactions CurrRunningXacts)
if (xlrec.subxid_overflow)
elog(DEBUG2,
- "snapshot of %d running transactions overflowed (lsn %X/%X oldest xid %u latest complete %u next xid %u)",
+ "snapshot of %d running transactions overflowed (lsn %X/%08X oldest xid %u latest complete %u next xid %u)",
CurrRunningXacts->xcnt,
LSN_FORMAT_ARGS(recptr),
CurrRunningXacts->oldestRunningXid,
@@ -1384,7 +1384,7 @@ LogCurrentRunningXacts(RunningTransactions CurrRunningXacts)
CurrRunningXacts->nextXid);
else
elog(DEBUG2,
- "snapshot of %d+%d running transaction ids (lsn %X/%X oldest xid %u latest complete %u next xid %u)",
+ "snapshot of %d+%d running transaction ids (lsn %X/%08X oldest xid %u latest complete %u next xid %u)",
CurrRunningXacts->xcnt, CurrRunningXacts->subxcnt,
LSN_FORMAT_ARGS(recptr),
CurrRunningXacts->oldestRunningXid,
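
The message changes above zero-pad the low half of the LSN to eight hex digits, so both halves always print with a fixed width. A minimal standalone sketch of the difference, using a stand-in for PostgreSQL's LSN_FORMAT_ARGS (assumed here to expand to the high and then the low 32-bit word of the LSN):

#include <stdint.h>
#include <stdio.h>

/* stand-in for LSN_FORMAT_ARGS(): high word, then low word */
#define SKETCH_LSN_FORMAT_ARGS(lsn) \
	(unsigned int) ((lsn) >> 32), (unsigned int) (lsn)

int
main(void)
{
	uint64_t	lsn = (UINT64_C(1) << 32) | 0xAB;	/* LSN 1/000000AB */

	printf("%X/%X\n", SKETCH_LSN_FORMAT_ARGS(lsn));		/* old: 1/AB */
	printf("%X/%08X\n", SKETCH_LSN_FORMAT_ARGS(lsn));	/* new: 1/000000AB */
	return 0;
}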
diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c
index 2776ceb295b..62f3471448e 100644
--- a/src/backend/storage/lmgr/lock.c
+++ b/src/backend/storage/lmgr/lock.c
@@ -3539,9 +3539,9 @@ AtPrepare_Locks(void)
* but that probably costs more cycles.
*/
void
-PostPrepare_Locks(TransactionId xid)
+PostPrepare_Locks(FullTransactionId fxid)
{
- PGPROC *newproc = TwoPhaseGetDummyProc(xid, false);
+ PGPROC *newproc = TwoPhaseGetDummyProc(fxid, false);
HASH_SEQ_STATUS status;
LOCALLOCK *locallock;
LOCK *lock;
@@ -4324,11 +4324,11 @@ DumpAllLocks(void)
* and PANIC anyway.
*/
void
-lock_twophase_recover(TransactionId xid, uint16 info,
+lock_twophase_recover(FullTransactionId fxid, uint16 info,
void *recdata, uint32 len)
{
TwoPhaseLockRecord *rec = (TwoPhaseLockRecord *) recdata;
- PGPROC *proc = TwoPhaseGetDummyProc(xid, false);
+ PGPROC *proc = TwoPhaseGetDummyProc(fxid, false);
LOCKTAG *locktag;
LOCKMODE lockmode;
LOCKMETHODID lockmethodid;
@@ -4505,7 +4505,7 @@ lock_twophase_recover(TransactionId xid, uint16 info,
* starting up into hot standby mode.
*/
void
-lock_twophase_standby_recover(TransactionId xid, uint16 info,
+lock_twophase_standby_recover(FullTransactionId fxid, uint16 info,
void *recdata, uint32 len)
{
TwoPhaseLockRecord *rec = (TwoPhaseLockRecord *) recdata;
@@ -4524,7 +4524,7 @@ lock_twophase_standby_recover(TransactionId xid, uint16 info,
if (lockmode == AccessExclusiveLock &&
locktag->locktag_type == LOCKTAG_RELATION)
{
- StandbyAcquireAccessExclusiveLock(xid,
+ StandbyAcquireAccessExclusiveLock(XidFromFullTransactionId(fxid),
locktag->locktag_field1 /* dboid */ ,
locktag->locktag_field2 /* reloid */ );
}
@@ -4537,11 +4537,11 @@ lock_twophase_standby_recover(TransactionId xid, uint16 info,
* Find and release the lock indicated by the 2PC record.
*/
void
-lock_twophase_postcommit(TransactionId xid, uint16 info,
+lock_twophase_postcommit(FullTransactionId fxid, uint16 info,
void *recdata, uint32 len)
{
TwoPhaseLockRecord *rec = (TwoPhaseLockRecord *) recdata;
- PGPROC *proc = TwoPhaseGetDummyProc(xid, true);
+ PGPROC *proc = TwoPhaseGetDummyProc(fxid, true);
LOCKTAG *locktag;
LOCKMETHODID lockmethodid;
LockMethod lockMethodTable;
@@ -4563,10 +4563,10 @@ lock_twophase_postcommit(TransactionId xid, uint16 info,
* This is actually just the same as the COMMIT case.
*/
void
-lock_twophase_postabort(TransactionId xid, uint16 info,
+lock_twophase_postabort(FullTransactionId fxid, uint16 info,
void *recdata, uint32 len)
{
- lock_twophase_postcommit(xid, info, recdata, len);
+ lock_twophase_postcommit(fxid, info, recdata, len);
}
/*
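
Here and in predicate.c below, the two-phase callbacks now receive a FullTransactionId and extract the 32-bit xid with XidFromFullTransactionId() where only that part is needed. For reference, a paraphrased sketch of the relevant definitions, modeled on access/transam.h (the exact macro bodies are an assumption, simplified to stdint types):

#include <stdint.h>

typedef uint32_t TransactionId;

/* 64-bit transaction ID: 32-bit epoch in the high half, xid in the low half */
typedef struct FullTransactionId
{
	uint64_t	value;
} FullTransactionId;

#define XidFromFullTransactionId(x)		((TransactionId) (x).value)
#define EpochFromFullTransactionId(x)	((uint32_t) ((x).value >> 32))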
diff --git a/src/backend/storage/lmgr/predicate.c b/src/backend/storage/lmgr/predicate.c
index d82114ffca1..c07fb588355 100644
--- a/src/backend/storage/lmgr/predicate.c
+++ b/src/backend/storage/lmgr/predicate.c
@@ -191,7 +191,7 @@
* AtPrepare_PredicateLocks(void);
* PostPrepare_PredicateLocks(TransactionId xid);
* PredicateLockTwoPhaseFinish(TransactionId xid, bool isCommit);
- * predicatelock_twophase_recover(TransactionId xid, uint16 info,
+ * predicatelock_twophase_recover(FullTransactionId fxid, uint16 info,
* void *recdata, uint32 len);
*/
@@ -4856,7 +4856,7 @@ AtPrepare_PredicateLocks(void)
* anyway. We only need to clean up our local state.
*/
void
-PostPrepare_PredicateLocks(TransactionId xid)
+PostPrepare_PredicateLocks(FullTransactionId fxid)
{
if (MySerializableXact == InvalidSerializableXact)
return;
@@ -4879,12 +4879,12 @@ PostPrepare_PredicateLocks(TransactionId xid)
* commits or aborts.
*/
void
-PredicateLockTwoPhaseFinish(TransactionId xid, bool isCommit)
+PredicateLockTwoPhaseFinish(FullTransactionId fxid, bool isCommit)
{
SERIALIZABLEXID *sxid;
SERIALIZABLEXIDTAG sxidtag;
- sxidtag.xid = xid;
+ sxidtag.xid = XidFromFullTransactionId(fxid);
LWLockAcquire(SerializableXactHashLock, LW_SHARED);
sxid = (SERIALIZABLEXID *)
@@ -4906,10 +4906,11 @@ PredicateLockTwoPhaseFinish(TransactionId xid, bool isCommit)
* Re-acquire a predicate lock belonging to a transaction that was prepared.
*/
void
-predicatelock_twophase_recover(TransactionId xid, uint16 info,
+predicatelock_twophase_recover(FullTransactionId fxid, uint16 info,
void *recdata, uint32 len)
{
TwoPhasePredicateRecord *record;
+ TransactionId xid = XidFromFullTransactionId(fxid);
Assert(len == sizeof(TwoPhasePredicateRecord));