aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/backend/access/transam/xlog.c33
-rw-r--r--src/backend/backup/basebackup_incremental.c90
-rw-r--r--src/backend/postmaster/walsummarizer.c142
-rw-r--r--src/bin/pg_combinebackup/meson.build1
-rw-r--r--src/bin/pg_combinebackup/t/008_promote.pl81
-rw-r--r--src/include/access/xlog.h1
-rw-r--r--src/include/postmaster/walsummarizer.h3
7 files changed, 241 insertions, 110 deletions
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index bb2685304e4..7f136026277 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -500,6 +500,11 @@ typedef struct XLogCtlData
* If we create a new timeline when the system was started up,
* PrevTimeLineID is the old timeline's ID that we forked off from.
* Otherwise it's equal to InsertTimeLineID.
+ *
+ * We set these fields while holding info_lck. Most code that reads these
+ * values knows that recovery is no longer in progress and so can safely
+ * read the value without a lock, but code that could be run either during
+ * or after recovery can take info_lck while reading these values.
*/
TimeLineID InsertTimeLineID;
TimeLineID PrevTimeLineID;
@@ -5317,6 +5322,13 @@ CleanupAfterArchiveRecovery(TimeLineID EndOfLogTLI, XLogRecPtr EndOfLog,
char partialfname[MAXFNAMELEN];
char partialpath[MAXPGPATH];
+ /*
+ * If we're summarizing WAL, we can't rename the partial file
+ * until the summarizer finishes with it, else it will fail.
+ */
+ if (summarize_wal)
+ WaitForWalSummarization(EndOfLog);
+
XLogFilePath(origpath, EndOfLogTLI, endLogSegNo, wal_segment_size);
snprintf(partialfname, MAXFNAMELEN, "%s.partial", origfname);
snprintf(partialpath, MAXPGPATH, "%s.partial", origpath);
@@ -5947,8 +5959,10 @@ StartupXLOG(void)
}
/* Save the selected TimeLineID in shared memory, too */
+ SpinLockAcquire(&XLogCtl->info_lck);
XLogCtl->InsertTimeLineID = newTLI;
XLogCtl->PrevTimeLineID = endOfRecoveryInfo->lastRecTLI;
+ SpinLockRelease(&XLogCtl->info_lck);
/*
* Actually, if WAL ended in an incomplete record, skip the parts that
@@ -6484,6 +6498,25 @@ GetWALInsertionTimeLine(void)
}
/*
+ * GetWALInsertionTimeLineIfSet -- If the system is not in recovery, returns
+ * the WAL insertion timeline; else, returns 0. Wherever possible, use
+ * GetWALInsertionTimeLine() instead, since it's cheaper. Note that this
+ * function decides recovery has ended as soon as the insert TLI is set, which
+ * happens before we set XLogCtl->SharedRecoveryState to RECOVERY_STATE_DONE.
+ */
+TimeLineID
+GetWALInsertionTimeLineIfSet(void)
+{
+ TimeLineID insertTLI;
+
+ SpinLockAcquire(&XLogCtl->info_lck);
+ insertTLI = XLogCtl->InsertTimeLineID;
+ SpinLockRelease(&XLogCtl->info_lck);
+
+ return insertTLI;
+}
+
+/*
* GetLastImportantRecPtr -- Returns the LSN of the last important record
* inserted. All records not explicitly marked as unimportant are considered
* important.
diff --git a/src/backend/backup/basebackup_incremental.c b/src/backend/backup/basebackup_incremental.c
index cc2e168129a..af361f38a6c 100644
--- a/src/backend/backup/basebackup_incremental.c
+++ b/src/backend/backup/basebackup_incremental.c
@@ -277,12 +277,6 @@ PrepareForIncrementalBackup(IncrementalBackupInfo *ib,
TimeLineID earliest_wal_range_tli = 0;
XLogRecPtr earliest_wal_range_start_lsn = InvalidXLogRecPtr;
TimeLineID latest_wal_range_tli = 0;
- XLogRecPtr summarized_lsn;
- XLogRecPtr pending_lsn;
- XLogRecPtr prior_pending_lsn = InvalidXLogRecPtr;
- int deadcycles = 0;
- TimestampTz initial_time,
- current_time;
Assert(ib->buf.data == NULL);
@@ -458,85 +452,13 @@ PrepareForIncrementalBackup(IncrementalBackupInfo *ib,
}
/*
- * Wait for WAL summarization to catch up to the backup start LSN (but
- * time out if it doesn't do so quickly enough).
+ * Wait for WAL summarization to catch up to the backup start LSN. This
+ * will throw an error if the WAL summarizer appears to be stuck. If WAL
+ * summarization gets disabled while we're waiting, this will return
+ * immediately, and we'll error out further down if the WAL summaries are
+ * incomplete.
*/
- initial_time = current_time = GetCurrentTimestamp();
- while (1)
- {
- long timeout_in_ms = 10000;
- long elapsed_seconds;
-
- /*
- * Align the wait time to prevent drift. This doesn't really matter,
- * but we'd like the warnings about how long we've been waiting to say
- * 10 seconds, 20 seconds, 30 seconds, 40 seconds ... without ever
- * drifting to something that is not a multiple of ten.
- */
- timeout_in_ms -=
- TimestampDifferenceMilliseconds(initial_time, current_time) %
- timeout_in_ms;
-
- /* Wait for up to 10 seconds. */
- summarized_lsn = WaitForWalSummarization(backup_state->startpoint,
- timeout_in_ms, &pending_lsn);
-
- /* If WAL summarization has progressed sufficiently, stop waiting. */
- if (summarized_lsn >= backup_state->startpoint)
- break;
-
- /*
- * Keep track of the number of cycles during which there has been no
- * progression of pending_lsn. If pending_lsn is not advancing, that
- * means that not only are no new files appearing on disk, but we're
- * not even incorporating new records into the in-memory state.
- */
- if (pending_lsn > prior_pending_lsn)
- {
- prior_pending_lsn = pending_lsn;
- deadcycles = 0;
- }
- else
- ++deadcycles;
-
- /*
- * If we've managed to wait for an entire minute without the WAL
- * summarizer absorbing a single WAL record, error out; probably
- * something is wrong.
- *
- * We could consider also erroring out if the summarizer is taking too
- * long to catch up, but it's not clear what rate of progress would be
- * acceptable and what would be too slow. So instead, we just try to
- * error out in the case where there's no progress at all. That seems
- * likely to catch a reasonable number of the things that can go wrong
- * in practice (e.g. the summarizer process is completely hung, say
- * because somebody hooked up a debugger to it or something) without
- * giving up too quickly when the system is just slow.
- */
- if (deadcycles >= 6)
- ereport(ERROR,
- (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
- errmsg("WAL summarization is not progressing"),
- errdetail("Summarization is needed through %X/%X, but is stuck at %X/%X on disk and %X/%X in memory.",
- LSN_FORMAT_ARGS(backup_state->startpoint),
- LSN_FORMAT_ARGS(summarized_lsn),
- LSN_FORMAT_ARGS(pending_lsn))));
-
- /*
- * Otherwise, just let the user know what's happening.
- */
- current_time = GetCurrentTimestamp();
- elapsed_seconds =
- TimestampDifferenceMilliseconds(initial_time, current_time) / 1000;
- ereport(WARNING,
- (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
- errmsg("still waiting for WAL summarization through %X/%X after %ld seconds",
- LSN_FORMAT_ARGS(backup_state->startpoint),
- elapsed_seconds),
- errdetail("Summarization has reached %X/%X on disk and %X/%X in memory.",
- LSN_FORMAT_ARGS(summarized_lsn),
- LSN_FORMAT_ARGS(pending_lsn))));
- }
+ WaitForWalSummarization(backup_state->startpoint);
/*
* Retrieve a list of all WAL summaries on any timeline that overlap with
diff --git a/src/backend/postmaster/walsummarizer.c b/src/backend/postmaster/walsummarizer.c
index 6bbc0405107..b62e2c36de9 100644
--- a/src/backend/postmaster/walsummarizer.c
+++ b/src/backend/postmaster/walsummarizer.c
@@ -650,54 +650,132 @@ SetWalSummarizerLatch(void)
}
/*
- * Wait until WAL summarization reaches the given LSN, but not longer than
- * the given timeout.
+ * Wait until WAL summarization reaches the given LSN, but time out with an
+ * error if the summarizer seems to be stuck.
*
- * The return value is the first still-unsummarized LSN. If it's greater than
- * or equal to the passed LSN, then that LSN was reached. If not, we timed out.
- *
- * Either way, *pending_lsn is set to the value taken from WalSummarizerCtl.
+ * Returns immediately if summarize_wal is turned off while we wait. Caller
+ * is expected to handle this case, if necessary.
*/
-XLogRecPtr
-WaitForWalSummarization(XLogRecPtr lsn, long timeout, XLogRecPtr *pending_lsn)
+void
+WaitForWalSummarization(XLogRecPtr lsn)
{
- TimestampTz start_time = GetCurrentTimestamp();
- TimestampTz deadline = TimestampTzPlusMilliseconds(start_time, timeout);
- XLogRecPtr summarized_lsn;
+ TimestampTz initial_time,
+ cycle_time,
+ current_time;
+ XLogRecPtr prior_pending_lsn = InvalidXLogRecPtr;
+ int deadcycles = 0;
- Assert(!XLogRecPtrIsInvalid(lsn));
- Assert(timeout > 0);
+ initial_time = cycle_time = GetCurrentTimestamp();
while (1)
{
- TimestampTz now;
- long remaining_timeout;
+ long timeout_in_ms = 10000;
+ XLogRecPtr summarized_lsn;
+ XLogRecPtr pending_lsn;
+
+ CHECK_FOR_INTERRUPTS();
+
+ /* If WAL summarization is disabled while we're waiting, give up. */
+ if (!summarize_wal)
+ return;
/*
* If the LSN summarized on disk has reached the target value, stop.
*/
LWLockAcquire(WALSummarizerLock, LW_EXCLUSIVE);
summarized_lsn = WalSummarizerCtl->summarized_lsn;
- *pending_lsn = WalSummarizerCtl->pending_lsn;
+ pending_lsn = WalSummarizerCtl->pending_lsn;
LWLockRelease(WALSummarizerLock);
+
+ /* If WAL summarization has progressed sufficiently, stop waiting. */
if (summarized_lsn >= lsn)
break;
- /* Timeout reached? If yes, stop. */
- now = GetCurrentTimestamp();
- remaining_timeout = TimestampDifferenceMilliseconds(now, deadline);
- if (remaining_timeout <= 0)
- break;
+ /* Recheck current time. */
+ current_time = GetCurrentTimestamp();
+
+ /* Have we finished the current cycle of waiting? */
+ if (TimestampDifferenceMilliseconds(cycle_time,
+ current_time) >= timeout_in_ms)
+ {
+ long elapsed_seconds;
+
+ /* Begin new wait cycle. */
+ cycle_time = TimestampTzPlusMilliseconds(cycle_time,
+ timeout_in_ms);
+
+ /*
+ * Keep track of the number of cycles during which there has been
+ * no progression of pending_lsn. If pending_lsn is not advancing,
+ * that means that not only are no new files appearing on disk,
+ * but we're not even incorporating new records into the in-memory
+ * state.
+ */
+ if (pending_lsn > prior_pending_lsn)
+ {
+ prior_pending_lsn = pending_lsn;
+ deadcycles = 0;
+ }
+ else
+ ++deadcycles;
+
+ /*
+ * If we've managed to wait for an entire minute without the WAL
+ * summarizer absorbing a single WAL record, error out; probably
+ * something is wrong.
+ *
+ * We could consider also erroring out if the summarizer is taking
+ * too long to catch up, but it's not clear what rate of progress
+ * would be acceptable and what would be too slow. So instead, we
+ * just try to error out in the case where there's no progress at
+ * all. That seems likely to catch a reasonable number of the
+ * things that can go wrong in practice (e.g. the summarizer
+ * process is completely hung, say because somebody hooked up a
+ * debugger to it or something) without giving up too quickly when
+ * the system is just slow.
+ */
+ if (deadcycles >= 6)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("WAL summarization is not progressing"),
+ errdetail("Summarization is needed through %X/%X, but is stuck at %X/%X on disk and %X/%X in memory.",
+ LSN_FORMAT_ARGS(lsn),
+ LSN_FORMAT_ARGS(summarized_lsn),
+ LSN_FORMAT_ARGS(pending_lsn))));
+
+
+ /*
+ * Otherwise, just let the user know what's happening.
+ */
+ elapsed_seconds =
+ TimestampDifferenceMilliseconds(initial_time,
+ current_time) / 1000;
+ ereport(WARNING,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("still waiting for WAL summarization through %X/%X after %ld seconds",
+ LSN_FORMAT_ARGS(lsn),
+ elapsed_seconds),
+ errdetail("Summarization has reached %X/%X on disk and %X/%X in memory.",
+ LSN_FORMAT_ARGS(summarized_lsn),
+ LSN_FORMAT_ARGS(pending_lsn))));
+ }
+
+ /*
+ * Align the wait time to prevent drift. This doesn't really matter,
+ * but we'd like the warnings about how long we've been waiting to say
+ * 10 seconds, 20 seconds, 30 seconds, 40 seconds ... without ever
+ * drifting to something that is not a multiple of ten.
+ */
+ timeout_in_ms -=
+ TimestampDifferenceMilliseconds(cycle_time, current_time);
/* Wait and see. */
ConditionVariableTimedSleep(&WalSummarizerCtl->summary_file_cv,
- remaining_timeout,
+ timeout_in_ms,
WAIT_EVENT_WAL_SUMMARY_READY);
}
ConditionVariableCancelSleep();
-
- return summarized_lsn;
}
/*
@@ -730,6 +808,22 @@ GetLatestLSN(TimeLineID *tli)
TimeLineID flush_tli;
XLogRecPtr replay_lsn;
TimeLineID replay_tli;
+ TimeLineID insert_tli;
+
+ /*
+ * After the insert TLI has been set and before the control file has
+ * been updated to show the DB in production, RecoveryInProgress()
+ * will return true, because it's not yet safe for all backends to
+ * begin writing WAL. However, replay has already ceased, so from our
+ * point of view, recovery is already over. We should summarize up to
+ * where replay stopped and then prepare to resume at the start of the
+ * insert timeline.
+ */
+ if ((insert_tli = GetWALInsertionTimeLineIfSet()) != 0)
+ {
+ *tli = insert_tli;
+ return GetXLogReplayRecPtr(NULL);
+ }
/*
* What we really want to know is how much WAL has been flushed to
diff --git a/src/bin/pg_combinebackup/meson.build b/src/bin/pg_combinebackup/meson.build
index d871b2e3b80..d142608e949 100644
--- a/src/bin/pg_combinebackup/meson.build
+++ b/src/bin/pg_combinebackup/meson.build
@@ -35,6 +35,7 @@ tests += {
't/005_integrity.pl',
't/006_db_file_copy.pl',
't/007_wal_level_minimal.pl',
+ 't/008_promote.pl',
],
}
}
diff --git a/src/bin/pg_combinebackup/t/008_promote.pl b/src/bin/pg_combinebackup/t/008_promote.pl
new file mode 100644
index 00000000000..1154a5d8b22
--- /dev/null
+++ b/src/bin/pg_combinebackup/t/008_promote.pl
@@ -0,0 +1,81 @@
+# Copyright (c) 2021-2024, PostgreSQL Global Development Group
+#
+# Test whether WAL summaries are complete such that incremental backup
+# can be performed after promoting a standby at an arbitrary LSN.
+
+use strict;
+use warnings FATAL => 'all';
+use File::Compare;
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+# Can be changed to test the other modes.
+my $mode = $ENV{PG_TEST_PG_COMBINEBACKUP_MODE} || '--copy';
+
+note "testing using mode $mode";
+
+# Set up a new database instance.
+my $node1 = PostgreSQL::Test::Cluster->new('node1');
+$node1->init(has_archiving => 1, allows_streaming => 1);
+$node1->append_conf('postgresql.conf', 'summarize_wal = on');
+$node1->append_conf('postgresql.conf', 'log_min_messages = debug1');
+$node1->start;
+
+# Create a table and insert a test row into it.
+$node1->safe_psql('postgres', <<EOM);
+CREATE TABLE mytable (a int, b text);
+INSERT INTO mytable VALUES (1, 'avocado');
+EOM
+
+# Take a full backup.
+my $backup1path = $node1->backup_dir . '/backup1';
+$node1->command_ok(
+ [ 'pg_basebackup', '-D', $backup1path, '--no-sync', '-cfast' ],
+ "full backup from node1");
+
+# Checkpoint and record LSN after.
+$node1->safe_psql('postgres', 'CHECKPOINT');
+my $lsn = $node1->safe_psql('postgres', 'SELECT pg_current_wal_insert_lsn()');
+
+# Insert a second row on the original node.
+$node1->safe_psql('postgres', <<EOM);
+INSERT INTO mytable VALUES (2, 'beetle');
+EOM
+
+# Now create a second node. We want this to stream from the first node and
+# then stop recovery at some arbitrary LSN, not just when it hits the end of
+# WAL, so use a recovery target.
+my $node2 = PostgreSQL::Test::Cluster->new('node2');
+$node2->init_from_backup($node1, 'backup1', 'has_streaming' => 1);
+$node2->append_conf('postgresql.conf', <<EOM);
+recovery_target_lsn = '$lsn'
+recovery_target_action = 'pause'
+EOM
+$node2->start();
+
+# Wait until recovery pauses, then promote.
+$node2->poll_query_until('postgres', "SELECT pg_get_wal_replay_pause_state() = 'paused';");
+$node2->safe_psql('postgres', "SELECT pg_promote()");
+
+# Once promotion occurs, insert a second row on the new node.
+$node2->poll_query_until('postgres', "SELECT pg_is_in_recovery() = 'f';");
+$node2->safe_psql('postgres', <<EOM);
+INSERT INTO mytable VALUES (2, 'blackberry');
+EOM
+
+# Now take an incremental backup. If WAL summarization didn't follow the
+# timeline change correctly, something should break at this point.
+my $backup2path = $node1->backup_dir . '/backup2';
+$node2->command_ok(
+ [ 'pg_basebackup', '-D', $backup2path, '--no-sync', '-cfast',
+ '--incremental', $backup1path . '/backup_manifest' ],
+ "incremental backup from node2");
+
+# Restore the incremental backup and use it to create a new node.
+my $node3 = PostgreSQL::Test::Cluster->new('node3');
+$node3->init_from_backup($node1, 'backup2',
+ combine_with_prior => [ 'backup1' ]);
+$node3->start();
+
+done_testing();
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index 1a1f11a943f..2c507ea618c 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -248,6 +248,7 @@ extern XLogRecPtr GetRedoRecPtr(void);
extern XLogRecPtr GetInsertRecPtr(void);
extern XLogRecPtr GetFlushRecPtr(TimeLineID *insertTLI);
extern TimeLineID GetWALInsertionTimeLine(void);
+extern TimeLineID GetWALInsertionTimeLineIfSet(void);
extern XLogRecPtr GetLastImportantRecPtr(void);
extern void SetWalWriterSleeping(bool sleeping);
diff --git a/src/include/postmaster/walsummarizer.h b/src/include/postmaster/walsummarizer.h
index 112bc1e6cba..aedca556764 100644
--- a/src/include/postmaster/walsummarizer.h
+++ b/src/include/postmaster/walsummarizer.h
@@ -30,7 +30,6 @@ extern void GetWalSummarizerState(TimeLineID *summarized_tli,
extern XLogRecPtr GetOldestUnsummarizedLSN(TimeLineID *tli,
bool *lsn_is_exact);
extern void SetWalSummarizerLatch(void);
-extern XLogRecPtr WaitForWalSummarization(XLogRecPtr lsn, long timeout,
- XLogRecPtr *pending_lsn);
+extern void WaitForWalSummarization(XLogRecPtr lsn);
#endif