7 files changed, 241 insertions, 110 deletions
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index bb2685304e4..7f136026277 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -500,6 +500,11 @@ typedef struct XLogCtlData
 	 * If we create a new timeline when the system was started up,
 	 * PrevTimeLineID is the old timeline's ID that we forked off from.
 	 * Otherwise it's equal to InsertTimeLineID.
+	 *
+	 * We set these fields while holding info_lck. Most code that reads these
+	 * values knows that recovery is no longer in progress and so can safely
+	 * read the value without a lock, but code that could be run either during
+	 * or after recovery can take info_lck while reading these values.
 	 */
 	TimeLineID	InsertTimeLineID;
 	TimeLineID	PrevTimeLineID;
@@ -5317,6 +5322,13 @@ CleanupAfterArchiveRecovery(TimeLineID EndOfLogTLI, XLogRecPtr EndOfLog,
 			char		partialfname[MAXFNAMELEN];
 			char		partialpath[MAXPGPATH];
 
+			/*
+			 * If we're summarizing WAL, we can't rename the partial file
+			 * until the summarizer finishes with it, else it will fail.
+			 */
+			if (summarize_wal)
+				WaitForWalSummarization(EndOfLog);
+
 			XLogFilePath(origpath, EndOfLogTLI, endLogSegNo, wal_segment_size);
 			snprintf(partialfname, MAXFNAMELEN, "%s.partial", origfname);
 			snprintf(partialpath, MAXPGPATH, "%s.partial", origpath);
@@ -5947,8 +5959,10 @@ StartupXLOG(void)
 	}
 
 	/* Save the selected TimeLineID in shared memory, too */
+	SpinLockAcquire(&XLogCtl->info_lck);
 	XLogCtl->InsertTimeLineID = newTLI;
 	XLogCtl->PrevTimeLineID = endOfRecoveryInfo->lastRecTLI;
+	SpinLockRelease(&XLogCtl->info_lck);
 
 	/*
 	 * Actually, if WAL ended in an incomplete record, skip the parts that
@@ -6484,6 +6498,25 @@ GetWALInsertionTimeLine(void)
 }
 
 /*
+ * GetWALInsertionTimeLineIfSet -- If the system is not in recovery, returns
+ * the WAL insertion timeline; else, returns 0. Wherever possible, use
+ * GetWALInsertionTimeLine() instead, since it's cheaper. Note that this
+ * function decides recovery has ended as soon as the insert TLI is set, which
+ * happens before we set XLogCtl->SharedRecoveryState to RECOVERY_STATE_DONE.
+ */
+TimeLineID
+GetWALInsertionTimeLineIfSet(void)
+{
+	TimeLineID	insertTLI;
+
+	SpinLockAcquire(&XLogCtl->info_lck);
+	insertTLI = XLogCtl->InsertTimeLineID;
+	SpinLockRelease(&XLogCtl->info_lck);
+
+	return insertTLI;
+}
+
+/*
  * GetLastImportantRecPtr -- Returns the LSN of the last important record
  * inserted. All records not explicitly marked as unimportant are considered
  * important.
diff --git a/src/backend/backup/basebackup_incremental.c b/src/backend/backup/basebackup_incremental.c
index cc2e168129a..af361f38a6c 100644
--- a/src/backend/backup/basebackup_incremental.c
+++ b/src/backend/backup/basebackup_incremental.c
@@ -277,12 +277,6 @@ PrepareForIncrementalBackup(IncrementalBackupInfo *ib,
 	TimeLineID	earliest_wal_range_tli = 0;
 	XLogRecPtr	earliest_wal_range_start_lsn = InvalidXLogRecPtr;
 	TimeLineID	latest_wal_range_tli = 0;
-	XLogRecPtr	summarized_lsn;
-	XLogRecPtr	pending_lsn;
-	XLogRecPtr	prior_pending_lsn = InvalidXLogRecPtr;
-	int			deadcycles = 0;
-	TimestampTz initial_time,
-				current_time;
 
 	Assert(ib->buf.data == NULL);
 
@@ -458,85 +452,13 @@ PrepareForIncrementalBackup(IncrementalBackupInfo *ib,
 	}
 
 	/*
-	 * Wait for WAL summarization to catch up to the backup start LSN (but
-	 * time out if it doesn't do so quickly enough).
+	 * Wait for WAL summarization to catch up to the backup start LSN. This
+	 * will throw an error if the WAL summarizer appears to be stuck. If WAL
+	 * summarization gets disabled while we're waiting, this will return
+	 * immediately, and we'll error out further down if the WAL summaries are
+	 * incomplete.
 	 */
-	initial_time = current_time = GetCurrentTimestamp();
-	while (1)
-	{
-		long		timeout_in_ms = 10000;
-		long		elapsed_seconds;
-
-		/*
-		 * Align the wait time to prevent drift. This doesn't really matter,
-		 * but we'd like the warnings about how long we've been waiting to say
-		 * 10 seconds, 20 seconds, 30 seconds, 40 seconds ... without ever
-		 * drifting to something that is not a multiple of ten.
-		 */
-		timeout_in_ms -=
-			TimestampDifferenceMilliseconds(initial_time, current_time) %
-			timeout_in_ms;
-
-		/* Wait for up to 10 seconds. */
-		summarized_lsn = WaitForWalSummarization(backup_state->startpoint,
-												 timeout_in_ms, &pending_lsn);
-
-		/* If WAL summarization has progressed sufficiently, stop waiting. */
-		if (summarized_lsn >= backup_state->startpoint)
-			break;
-
-		/*
-		 * Keep track of the number of cycles during which there has been no
-		 * progression of pending_lsn. If pending_lsn is not advancing, that
-		 * means that not only are no new files appearing on disk, but we're
-		 * not even incorporating new records into the in-memory state.
-		 */
-		if (pending_lsn > prior_pending_lsn)
-		{
-			prior_pending_lsn = pending_lsn;
-			deadcycles = 0;
-		}
-		else
-			++deadcycles;
-
-		/*
-		 * If we've managed to wait for an entire minute without the WAL
-		 * summarizer absorbing a single WAL record, error out; probably
-		 * something is wrong.
-		 *
-		 * We could consider also erroring out if the summarizer is taking too
-		 * long to catch up, but it's not clear what rate of progress would be
-		 * acceptable and what would be too slow. So instead, we just try to
-		 * error out in the case where there's no progress at all. That seems
-		 * likely to catch a reasonable number of the things that can go wrong
-		 * in practice (e.g. the summarizer process is completely hung, say
-		 * because somebody hooked up a debugger to it or something) without
-		 * giving up too quickly when the system is just slow.
-		 */
-		if (deadcycles >= 6)
-			ereport(ERROR,
-					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
-					 errmsg("WAL summarization is not progressing"),
-					 errdetail("Summarization is needed through %X/%X, but is stuck at %X/%X on disk and %X/%X in memory.",
-							   LSN_FORMAT_ARGS(backup_state->startpoint),
-							   LSN_FORMAT_ARGS(summarized_lsn),
-							   LSN_FORMAT_ARGS(pending_lsn))));
-
-		/*
-		 * Otherwise, just let the user know what's happening.
-		 */
-		current_time = GetCurrentTimestamp();
-		elapsed_seconds =
-			TimestampDifferenceMilliseconds(initial_time, current_time) / 1000;
-		ereport(WARNING,
-				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
-				 errmsg("still waiting for WAL summarization through %X/%X after %ld seconds",
-						LSN_FORMAT_ARGS(backup_state->startpoint),
-						elapsed_seconds),
-				 errdetail("Summarization has reached %X/%X on disk and %X/%X in memory.",
-						   LSN_FORMAT_ARGS(summarized_lsn),
-						   LSN_FORMAT_ARGS(pending_lsn))));
-	}
+	WaitForWalSummarization(backup_state->startpoint);
 
 	/*
 	 * Retrieve a list of all WAL summaries on any timeline that overlap with
diff --git a/src/backend/postmaster/walsummarizer.c b/src/backend/postmaster/walsummarizer.c
index 6bbc0405107..b62e2c36de9 100644
--- a/src/backend/postmaster/walsummarizer.c
+++ b/src/backend/postmaster/walsummarizer.c
@@ -650,54 +650,132 @@ SetWalSummarizerLatch(void)
 }
 
 /*
- * Wait until WAL summarization reaches the given LSN, but not longer than
- * the given timeout.
+ * Wait until WAL summarization reaches the given LSN, but time out with an
+ * error if the summarizer seems to be stuck.
  *
- * The return value is the first still-unsummarized LSN. If it's greater than
- * or equal to the passed LSN, then that LSN was reached. If not, we timed out.
- *
- * Either way, *pending_lsn is set to the value taken from WalSummarizerCtl.
+ * Returns immediately if summarize_wal is turned off while we wait. Caller
+ * is expected to handle this case, if necessary.
  */
-XLogRecPtr
-WaitForWalSummarization(XLogRecPtr lsn, long timeout, XLogRecPtr *pending_lsn)
+void
+WaitForWalSummarization(XLogRecPtr lsn)
 {
-	TimestampTz start_time = GetCurrentTimestamp();
-	TimestampTz deadline = TimestampTzPlusMilliseconds(start_time, timeout);
-	XLogRecPtr	summarized_lsn;
+	TimestampTz initial_time,
+				cycle_time,
+				current_time;
+	XLogRecPtr	prior_pending_lsn = InvalidXLogRecPtr;
+	int			deadcycles = 0;
 
-	Assert(!XLogRecPtrIsInvalid(lsn));
-	Assert(timeout > 0);
+	initial_time = cycle_time = GetCurrentTimestamp();
 
 	while (1)
 	{
-		TimestampTz now;
-		long		remaining_timeout;
+		long		timeout_in_ms = 10000;
+		XLogRecPtr	summarized_lsn;
+		XLogRecPtr	pending_lsn;
+
+		CHECK_FOR_INTERRUPTS();
+
+		/* If WAL summarization is disabled while we're waiting, give up. */
+		if (!summarize_wal)
+			return;
 
 		/*
 		 * If the LSN summarized on disk has reached the target value, stop.
 		 */
 		LWLockAcquire(WALSummarizerLock, LW_EXCLUSIVE);
 		summarized_lsn = WalSummarizerCtl->summarized_lsn;
-		*pending_lsn = WalSummarizerCtl->pending_lsn;
+		pending_lsn = WalSummarizerCtl->pending_lsn;
 		LWLockRelease(WALSummarizerLock);
+
+		/* If WAL summarization has progressed sufficiently, stop waiting. */
 		if (summarized_lsn >= lsn)
 			break;
 
-		/* Timeout reached? If yes, stop. */
-		now = GetCurrentTimestamp();
-		remaining_timeout = TimestampDifferenceMilliseconds(now, deadline);
-		if (remaining_timeout <= 0)
-			break;
+		/* Recheck current time. */
+		current_time = GetCurrentTimestamp();
+
+		/* Have we finished the current cycle of waiting? */
+		if (TimestampDifferenceMilliseconds(cycle_time,
+											current_time) >= timeout_in_ms)
+		{
+			long		elapsed_seconds;
+
+			/* Begin new wait cycle. */
+			cycle_time = TimestampTzPlusMilliseconds(cycle_time,
+													 timeout_in_ms);
+
+			/*
+			 * Keep track of the number of cycles during which there has been
+			 * no progression of pending_lsn. If pending_lsn is not advancing,
+			 * that means that not only are no new files appearing on disk,
+			 * but we're not even incorporating new records into the in-memory
+			 * state.
+			 */
+			if (pending_lsn > prior_pending_lsn)
+			{
+				prior_pending_lsn = pending_lsn;
+				deadcycles = 0;
+			}
+			else
+				++deadcycles;
+
+			/*
+			 * If we've managed to wait for an entire minute without the WAL
+			 * summarizer absorbing a single WAL record, error out; probably
+			 * something is wrong.
+			 *
+			 * We could consider also erroring out if the summarizer is taking
+			 * too long to catch up, but it's not clear what rate of progress
+			 * would be acceptable and what would be too slow. So instead, we
+			 * just try to error out in the case where there's no progress at
+			 * all. That seems likely to catch a reasonable number of the
+			 * things that can go wrong in practice (e.g. the summarizer
+			 * process is completely hung, say because somebody hooked up a
+			 * debugger to it or something) without giving up too quickly when
+			 * the system is just slow.
+			 */
+			if (deadcycles >= 6)
+				ereport(ERROR,
+						(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+						 errmsg("WAL summarization is not progressing"),
+						 errdetail("Summarization is needed through %X/%X, but is stuck at %X/%X on disk and %X/%X in memory.",
+								   LSN_FORMAT_ARGS(lsn),
+								   LSN_FORMAT_ARGS(summarized_lsn),
+								   LSN_FORMAT_ARGS(pending_lsn))));
+
+
+			/*
+			 * Otherwise, just let the user know what's happening.
+			 */
+			elapsed_seconds =
+				TimestampDifferenceMilliseconds(initial_time,
+												current_time) / 1000;
+			ereport(WARNING,
+					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+					 errmsg("still waiting for WAL summarization through %X/%X after %ld seconds",
+							LSN_FORMAT_ARGS(lsn),
+							elapsed_seconds),
+					 errdetail("Summarization has reached %X/%X on disk and %X/%X in memory.",
+							   LSN_FORMAT_ARGS(summarized_lsn),
+							   LSN_FORMAT_ARGS(pending_lsn))));
+		}
+
+		/*
+		 * Align the wait time to prevent drift. This doesn't really matter,
+		 * but we'd like the warnings about how long we've been waiting to say
+		 * 10 seconds, 20 seconds, 30 seconds, 40 seconds ... without ever
+		 * drifting to something that is not a multiple of ten.
+		 */
+		timeout_in_ms -=
+			TimestampDifferenceMilliseconds(cycle_time, current_time);
 
 		/* Wait and see. */
 		ConditionVariableTimedSleep(&WalSummarizerCtl->summary_file_cv,
-									remaining_timeout,
+									timeout_in_ms,
 									WAIT_EVENT_WAL_SUMMARY_READY);
 	}
 
 	ConditionVariableCancelSleep();
-
-	return summarized_lsn;
 }
 
 /*
@@ -730,6 +808,22 @@ GetLatestLSN(TimeLineID *tli)
 		TimeLineID	flush_tli;
 		XLogRecPtr	replay_lsn;
 		TimeLineID	replay_tli;
+		TimeLineID	insert_tli;
+
+		/*
+		 * After the insert TLI has been set and before the control file has
+		 * been updated to show the DB in production, RecoveryInProgress()
+		 * will return true, because it's not yet safe for all backends to
+		 * begin writing WAL. However, replay has already ceased, so from our
+		 * point of view, recovery is already over. We should summarize up to
+		 * where replay stopped and then prepare to resume at the start of the
+		 * insert timeline.
+		 */
+		if ((insert_tli = GetWALInsertionTimeLineIfSet()) != 0)
+		{
+			*tli = insert_tli;
+			return GetXLogReplayRecPtr(NULL);
+		}
 
 		/*
 		 * What we really want to know is how much WAL has been flushed to
diff --git a/src/bin/pg_combinebackup/meson.build b/src/bin/pg_combinebackup/meson.build
index d871b2e3b80..d142608e949 100644
--- a/src/bin/pg_combinebackup/meson.build
+++ b/src/bin/pg_combinebackup/meson.build
@@ -35,6 +35,7 @@ tests += {
       't/005_integrity.pl',
       't/006_db_file_copy.pl',
       't/007_wal_level_minimal.pl',
+      't/008_promote.pl',
     ],
   }
 }
diff --git a/src/bin/pg_combinebackup/t/008_promote.pl b/src/bin/pg_combinebackup/t/008_promote.pl
new file mode 100644
index 00000000000..1154a5d8b22
--- /dev/null
+++ b/src/bin/pg_combinebackup/t/008_promote.pl
@@ -0,0 +1,81 @@
+# Copyright (c) 2021-2024, PostgreSQL Global Development Group
+#
+# Test whether WAL summaries are complete such that incremental backup
+# can be performed after promoting a standby at an arbitrary LSN.
+
+use strict;
+use warnings FATAL => 'all';
+use File::Compare;
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+# Can be changed to test the other modes.
+my $mode = $ENV{PG_TEST_PG_COMBINEBACKUP_MODE} || '--copy';
+
+note "testing using mode $mode";
+
+# Set up a new database instance.
+my $node1 = PostgreSQL::Test::Cluster->new('node1');
+$node1->init(has_archiving => 1, allows_streaming => 1);
+$node1->append_conf('postgresql.conf', 'summarize_wal = on');
+$node1->append_conf('postgresql.conf', 'log_min_messages = debug1');
+$node1->start;
+
+# Create a table and insert a test row into it.
+$node1->safe_psql('postgres', <<EOM);
+CREATE TABLE mytable (a int, b text);
+INSERT INTO mytable VALUES (1, 'avocado');
+EOM
+
+# Take a full backup.
+my $backup1path = $node1->backup_dir . '/backup1';
+$node1->command_ok(
+	[ 'pg_basebackup', '-D', $backup1path, '--no-sync', '-cfast' ],
+	"full backup from node1");
+
+# Checkpoint and record LSN after.
+$node1->safe_psql('postgres', 'CHECKPOINT');
+my $lsn = $node1->safe_psql('postgres', 'SELECT pg_current_wal_insert_lsn()');
+
+# Insert a second row on the original node.
+$node1->safe_psql('postgres', <<EOM);
+INSERT INTO mytable VALUES (2, 'beetle');
+EOM
+
+# Now create a second node. We want this to stream from the first node and
+# then stop recovery at some arbitrary LSN, not just when it hits the end of
+# WAL, so use a recovery target.
+my $node2 = PostgreSQL::Test::Cluster->new('node2');
+$node2->init_from_backup($node1, 'backup1', 'has_streaming' => 1);
+$node2->append_conf('postgresql.conf', <<EOM);
+recovery_target_lsn = '$lsn'
+recovery_target_action = 'pause'
+EOM
+$node2->start();
+
+# Wait until recovery pauses, then promote.
+$node2->poll_query_until('postgres', "SELECT pg_get_wal_replay_pause_state() = 'paused';");
+$node2->safe_psql('postgres', "SELECT pg_promote()");
+
+# Once promotion occurs, insert a second row on the new node.
+$node2->poll_query_until('postgres', "SELECT pg_is_in_recovery() = 'f';");
+$node2->safe_psql('postgres', <<EOM);
+INSERT INTO mytable VALUES (2, 'blackberry');
+EOM
+
+# Now take an incremental backup. If WAL summarization didn't follow the
+# timeline change correctly, something should break at this point.
+my $backup2path = $node1->backup_dir . '/backup2';
+$node2->command_ok(
+	[ 'pg_basebackup', '-D', $backup2path, '--no-sync', '-cfast',
+	  '--incremental', $backup1path . '/backup_manifest' ],
+	"incremental backup from node2");
+
+# Restore the incremental backup and use it to create a new node.
+my $node3 = PostgreSQL::Test::Cluster->new('node3');
+$node3->init_from_backup($node1, 'backup2',
+						 combine_with_prior => [ 'backup1' ]);
+$node3->start();
+
+done_testing();
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index 1a1f11a943f..2c507ea618c 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -248,6 +248,7 @@ extern XLogRecPtr GetRedoRecPtr(void);
 extern XLogRecPtr GetInsertRecPtr(void);
 extern XLogRecPtr GetFlushRecPtr(TimeLineID *insertTLI);
 extern TimeLineID GetWALInsertionTimeLine(void);
+extern TimeLineID GetWALInsertionTimeLineIfSet(void);
 extern XLogRecPtr GetLastImportantRecPtr(void);
 
 extern void SetWalWriterSleeping(bool sleeping);
diff --git a/src/include/postmaster/walsummarizer.h b/src/include/postmaster/walsummarizer.h
index 112bc1e6cba..aedca556764 100644
--- a/src/include/postmaster/walsummarizer.h
+++ b/src/include/postmaster/walsummarizer.h
@@ -30,7 +30,6 @@ extern void GetWalSummarizerState(TimeLineID *summarized_tli,
 extern XLogRecPtr GetOldestUnsummarizedLSN(TimeLineID *tli,
 										   bool *lsn_is_exact);
 extern void SetWalSummarizerLatch(void);
-extern XLogRecPtr WaitForWalSummarization(XLogRecPtr lsn, long timeout,
-										  XLogRecPtr *pending_lsn);
+extern void WaitForWalSummarization(XLogRecPtr lsn);
 
 #endif