diff options
-rw-r--r-- | src/backend/access/transam/xlog.c | 33 | ||||
-rw-r--r-- | src/backend/backup/basebackup_incremental.c | 90 | ||||
-rw-r--r-- | src/backend/postmaster/walsummarizer.c | 142 | ||||
-rw-r--r-- | src/bin/pg_combinebackup/meson.build | 1 | ||||
-rw-r--r-- | src/bin/pg_combinebackup/t/008_promote.pl | 81 | ||||
-rw-r--r-- | src/include/access/xlog.h | 1 | ||||
-rw-r--r-- | src/include/postmaster/walsummarizer.h | 3 |
7 files changed, 241 insertions, 110 deletions
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index bb2685304e4..7f136026277 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -500,6 +500,11 @@ typedef struct XLogCtlData * If we create a new timeline when the system was started up, * PrevTimeLineID is the old timeline's ID that we forked off from. * Otherwise it's equal to InsertTimeLineID. + * + * We set these fields while holding info_lck. Most code that reads these + * values knows that recovery is no longer in progress and so can safely + * read the value without a lock, but code that could be run either during + * or after recovery can take info_lck while reading these values. */ TimeLineID InsertTimeLineID; TimeLineID PrevTimeLineID; @@ -5317,6 +5322,13 @@ CleanupAfterArchiveRecovery(TimeLineID EndOfLogTLI, XLogRecPtr EndOfLog, char partialfname[MAXFNAMELEN]; char partialpath[MAXPGPATH]; + /* + * If we're summarizing WAL, we can't rename the partial file + * until the summarizer finishes with it, else it will fail. + */ + if (summarize_wal) + WaitForWalSummarization(EndOfLog); + XLogFilePath(origpath, EndOfLogTLI, endLogSegNo, wal_segment_size); snprintf(partialfname, MAXFNAMELEN, "%s.partial", origfname); snprintf(partialpath, MAXPGPATH, "%s.partial", origpath); @@ -5947,8 +5959,10 @@ StartupXLOG(void) } /* Save the selected TimeLineID in shared memory, too */ + SpinLockAcquire(&XLogCtl->info_lck); XLogCtl->InsertTimeLineID = newTLI; XLogCtl->PrevTimeLineID = endOfRecoveryInfo->lastRecTLI; + SpinLockRelease(&XLogCtl->info_lck); /* * Actually, if WAL ended in an incomplete record, skip the parts that @@ -6484,6 +6498,25 @@ GetWALInsertionTimeLine(void) } /* + * GetWALInsertionTimeLineIfSet -- If the system is not in recovery, returns + * the WAL insertion timeline; else, returns 0. Wherever possible, use + * GetWALInsertionTimeLine() instead, since it's cheaper. 
Note that this + * function decides recovery has ended as soon as the insert TLI is set, which + * happens before we set XLogCtl->SharedRecoveryState to RECOVERY_STATE_DONE. + */ +TimeLineID +GetWALInsertionTimeLineIfSet(void) +{ + TimeLineID insertTLI; + + SpinLockAcquire(&XLogCtl->info_lck); + insertTLI = XLogCtl->InsertTimeLineID; + SpinLockRelease(&XLogCtl->info_lck); + + return insertTLI; +} + +/* * GetLastImportantRecPtr -- Returns the LSN of the last important record * inserted. All records not explicitly marked as unimportant are considered * important. diff --git a/src/backend/backup/basebackup_incremental.c b/src/backend/backup/basebackup_incremental.c index cc2e168129a..af361f38a6c 100644 --- a/src/backend/backup/basebackup_incremental.c +++ b/src/backend/backup/basebackup_incremental.c @@ -277,12 +277,6 @@ PrepareForIncrementalBackup(IncrementalBackupInfo *ib, TimeLineID earliest_wal_range_tli = 0; XLogRecPtr earliest_wal_range_start_lsn = InvalidXLogRecPtr; TimeLineID latest_wal_range_tli = 0; - XLogRecPtr summarized_lsn; - XLogRecPtr pending_lsn; - XLogRecPtr prior_pending_lsn = InvalidXLogRecPtr; - int deadcycles = 0; - TimestampTz initial_time, - current_time; Assert(ib->buf.data == NULL); @@ -458,85 +452,13 @@ PrepareForIncrementalBackup(IncrementalBackupInfo *ib, } /* - * Wait for WAL summarization to catch up to the backup start LSN (but - * time out if it doesn't do so quickly enough). + * Wait for WAL summarization to catch up to the backup start LSN. This + * will throw an error if the WAL summarizer appears to be stuck. If WAL + * summarization gets disabled while we're waiting, this will return + * immediately, and we'll error out further down if the WAL summaries are + * incomplete. */ - initial_time = current_time = GetCurrentTimestamp(); - while (1) - { - long timeout_in_ms = 10000; - long elapsed_seconds; - - /* - * Align the wait time to prevent drift. 
This doesn't really matter, - * but we'd like the warnings about how long we've been waiting to say - * 10 seconds, 20 seconds, 30 seconds, 40 seconds ... without ever - * drifting to something that is not a multiple of ten. - */ - timeout_in_ms -= - TimestampDifferenceMilliseconds(initial_time, current_time) % - timeout_in_ms; - - /* Wait for up to 10 seconds. */ - summarized_lsn = WaitForWalSummarization(backup_state->startpoint, - timeout_in_ms, &pending_lsn); - - /* If WAL summarization has progressed sufficiently, stop waiting. */ - if (summarized_lsn >= backup_state->startpoint) - break; - - /* - * Keep track of the number of cycles during which there has been no - * progression of pending_lsn. If pending_lsn is not advancing, that - * means that not only are no new files appearing on disk, but we're - * not even incorporating new records into the in-memory state. - */ - if (pending_lsn > prior_pending_lsn) - { - prior_pending_lsn = pending_lsn; - deadcycles = 0; - } - else - ++deadcycles; - - /* - * If we've managed to wait for an entire minute without the WAL - * summarizer absorbing a single WAL record, error out; probably - * something is wrong. - * - * We could consider also erroring out if the summarizer is taking too - * long to catch up, but it's not clear what rate of progress would be - * acceptable and what would be too slow. So instead, we just try to - * error out in the case where there's no progress at all. That seems - * likely to catch a reasonable number of the things that can go wrong - * in practice (e.g. the summarizer process is completely hung, say - * because somebody hooked up a debugger to it or something) without - * giving up too quickly when the system is just slow. 
- */ - if (deadcycles >= 6) - ereport(ERROR, - (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("WAL summarization is not progressing"), - errdetail("Summarization is needed through %X/%X, but is stuck at %X/%X on disk and %X/%X in memory.", - LSN_FORMAT_ARGS(backup_state->startpoint), - LSN_FORMAT_ARGS(summarized_lsn), - LSN_FORMAT_ARGS(pending_lsn)))); - - /* - * Otherwise, just let the user know what's happening. - */ - current_time = GetCurrentTimestamp(); - elapsed_seconds = - TimestampDifferenceMilliseconds(initial_time, current_time) / 1000; - ereport(WARNING, - (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("still waiting for WAL summarization through %X/%X after %ld seconds", - LSN_FORMAT_ARGS(backup_state->startpoint), - elapsed_seconds), - errdetail("Summarization has reached %X/%X on disk and %X/%X in memory.", - LSN_FORMAT_ARGS(summarized_lsn), - LSN_FORMAT_ARGS(pending_lsn)))); - } + WaitForWalSummarization(backup_state->startpoint); /* * Retrieve a list of all WAL summaries on any timeline that overlap with diff --git a/src/backend/postmaster/walsummarizer.c b/src/backend/postmaster/walsummarizer.c index 6bbc0405107..b62e2c36de9 100644 --- a/src/backend/postmaster/walsummarizer.c +++ b/src/backend/postmaster/walsummarizer.c @@ -650,54 +650,132 @@ SetWalSummarizerLatch(void) } /* - * Wait until WAL summarization reaches the given LSN, but not longer than - * the given timeout. + * Wait until WAL summarization reaches the given LSN, but time out with an + * error if the summarizer seems to be stuck. * - * The return value is the first still-unsummarized LSN. If it's greater than - * or equal to the passed LSN, then that LSN was reached. If not, we timed out. - * - * Either way, *pending_lsn is set to the value taken from WalSummarizerCtl. + * Returns immediately if summarize_wal is turned off while we wait. Caller + * is expected to handle this case, if necessary. 
*/ -XLogRecPtr -WaitForWalSummarization(XLogRecPtr lsn, long timeout, XLogRecPtr *pending_lsn) +void +WaitForWalSummarization(XLogRecPtr lsn) { - TimestampTz start_time = GetCurrentTimestamp(); - TimestampTz deadline = TimestampTzPlusMilliseconds(start_time, timeout); - XLogRecPtr summarized_lsn; + TimestampTz initial_time, + cycle_time, + current_time; + XLogRecPtr prior_pending_lsn = InvalidXLogRecPtr; + int deadcycles = 0; - Assert(!XLogRecPtrIsInvalid(lsn)); - Assert(timeout > 0); + initial_time = cycle_time = GetCurrentTimestamp(); while (1) { - TimestampTz now; - long remaining_timeout; + long timeout_in_ms = 10000; + XLogRecPtr summarized_lsn; + XLogRecPtr pending_lsn; + + CHECK_FOR_INTERRUPTS(); + + /* If WAL summarization is disabled while we're waiting, give up. */ + if (!summarize_wal) + return; /* * If the LSN summarized on disk has reached the target value, stop. */ LWLockAcquire(WALSummarizerLock, LW_EXCLUSIVE); summarized_lsn = WalSummarizerCtl->summarized_lsn; - *pending_lsn = WalSummarizerCtl->pending_lsn; + pending_lsn = WalSummarizerCtl->pending_lsn; LWLockRelease(WALSummarizerLock); + + /* If WAL summarization has progressed sufficiently, stop waiting. */ if (summarized_lsn >= lsn) break; - /* Timeout reached? If yes, stop. */ - now = GetCurrentTimestamp(); - remaining_timeout = TimestampDifferenceMilliseconds(now, deadline); - if (remaining_timeout <= 0) - break; + /* Recheck current time. */ + current_time = GetCurrentTimestamp(); + + /* Have we finished the current cycle of waiting? */ + if (TimestampDifferenceMilliseconds(cycle_time, + current_time) >= timeout_in_ms) + { + long elapsed_seconds; + + /* Begin new wait cycle. */ + cycle_time = TimestampTzPlusMilliseconds(cycle_time, + timeout_in_ms); + + /* + * Keep track of the number of cycles during which there has been + * no progression of pending_lsn. 
If pending_lsn is not advancing, + * that means that not only are no new files appearing on disk, + * but we're not even incorporating new records into the in-memory + * state. + */ + if (pending_lsn > prior_pending_lsn) + { + prior_pending_lsn = pending_lsn; + deadcycles = 0; + } + else + ++deadcycles; + + /* + * If we've managed to wait for an entire minute without the WAL + * summarizer absorbing a single WAL record, error out; probably + * something is wrong. + * + * We could consider also erroring out if the summarizer is taking + * too long to catch up, but it's not clear what rate of progress + * would be acceptable and what would be too slow. So instead, we + * just try to error out in the case where there's no progress at + * all. That seems likely to catch a reasonable number of the + * things that can go wrong in practice (e.g. the summarizer + * process is completely hung, say because somebody hooked up a + * debugger to it or something) without giving up too quickly when + * the system is just slow. + */ + if (deadcycles >= 6) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("WAL summarization is not progressing"), + errdetail("Summarization is needed through %X/%X, but is stuck at %X/%X on disk and %X/%X in memory.", + LSN_FORMAT_ARGS(lsn), + LSN_FORMAT_ARGS(summarized_lsn), + LSN_FORMAT_ARGS(pending_lsn)))); + + + /* + * Otherwise, just let the user know what's happening. + */ + elapsed_seconds = + TimestampDifferenceMilliseconds(initial_time, + current_time) / 1000; + ereport(WARNING, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("still waiting for WAL summarization through %X/%X after %ld seconds", + LSN_FORMAT_ARGS(lsn), + elapsed_seconds), + errdetail("Summarization has reached %X/%X on disk and %X/%X in memory.", + LSN_FORMAT_ARGS(summarized_lsn), + LSN_FORMAT_ARGS(pending_lsn)))); + } + + /* + * Align the wait time to prevent drift. 
This doesn't really matter, + * but we'd like the warnings about how long we've been waiting to say + * 10 seconds, 20 seconds, 30 seconds, 40 seconds ... without ever + * drifting to something that is not a multiple of ten. + */ + timeout_in_ms -= + TimestampDifferenceMilliseconds(cycle_time, current_time); /* Wait and see. */ ConditionVariableTimedSleep(&WalSummarizerCtl->summary_file_cv, - remaining_timeout, + timeout_in_ms, WAIT_EVENT_WAL_SUMMARY_READY); } ConditionVariableCancelSleep(); - - return summarized_lsn; } /* @@ -730,6 +808,22 @@ GetLatestLSN(TimeLineID *tli) TimeLineID flush_tli; XLogRecPtr replay_lsn; TimeLineID replay_tli; + TimeLineID insert_tli; + + /* + * After the insert TLI has been set and before the control file has + * been updated to show the DB in production, RecoveryInProgress() + * will return true, because it's not yet safe for all backends to + * begin writing WAL. However, replay has already ceased, so from our + * point of view, recovery is already over. We should summarize up to + * where replay stopped and then prepare to resume at the start of the + * insert timeline. 
+ */ + if ((insert_tli = GetWALInsertionTimeLineIfSet()) != 0) + { + *tli = insert_tli; + return GetXLogReplayRecPtr(NULL); + } /* * What we really want to know is how much WAL has been flushed to diff --git a/src/bin/pg_combinebackup/meson.build b/src/bin/pg_combinebackup/meson.build index d871b2e3b80..d142608e949 100644 --- a/src/bin/pg_combinebackup/meson.build +++ b/src/bin/pg_combinebackup/meson.build @@ -35,6 +35,7 @@ tests += { 't/005_integrity.pl', 't/006_db_file_copy.pl', 't/007_wal_level_minimal.pl', + 't/008_promote.pl', ], } } diff --git a/src/bin/pg_combinebackup/t/008_promote.pl b/src/bin/pg_combinebackup/t/008_promote.pl new file mode 100644 index 00000000000..1154a5d8b22 --- /dev/null +++ b/src/bin/pg_combinebackup/t/008_promote.pl @@ -0,0 +1,81 @@ +# Copyright (c) 2021-2024, PostgreSQL Global Development Group +# +# Test whether WAL summaries are complete such that incremental backup +# can be performed after promoting a standby at an arbitrary LSN. + +use strict; +use warnings FATAL => 'all'; +use File::Compare; +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +# Can be changed to test the other modes. +my $mode = $ENV{PG_TEST_PG_COMBINEBACKUP_MODE} || '--copy'; + +note "testing using mode $mode"; + +# Set up a new database instance. +my $node1 = PostgreSQL::Test::Cluster->new('node1'); +$node1->init(has_archiving => 1, allows_streaming => 1); +$node1->append_conf('postgresql.conf', 'summarize_wal = on'); +$node1->append_conf('postgresql.conf', 'log_min_messages = debug1'); +$node1->start; + +# Create a table and insert a test row into it. +$node1->safe_psql('postgres', <<EOM); +CREATE TABLE mytable (a int, b text); +INSERT INTO mytable VALUES (1, 'avocado'); +EOM + +# Take a full backup. +my $backup1path = $node1->backup_dir . '/backup1'; +$node1->command_ok( + [ 'pg_basebackup', '-D', $backup1path, '--no-sync', '-cfast' ], + "full backup from node1"); + +# Checkpoint and record LSN after. 
+$node1->safe_psql('postgres', 'CHECKPOINT'); +my $lsn = $node1->safe_psql('postgres', 'SELECT pg_current_wal_insert_lsn()'); + +# Insert a second row on the original node. +$node1->safe_psql('postgres', <<EOM); +INSERT INTO mytable VALUES (2, 'beetle'); +EOM + +# Now create a second node. We want this to stream from the first node and +# then stop recovery at some arbitrary LSN, not just when it hits the end of +# WAL, so use a recovery target. +my $node2 = PostgreSQL::Test::Cluster->new('node2'); +$node2->init_from_backup($node1, 'backup1', 'has_streaming' => 1); +$node2->append_conf('postgresql.conf', <<EOM); +recovery_target_lsn = '$lsn' +recovery_target_action = 'pause' +EOM +$node2->start(); + +# Wait until recovery pauses, then promote. +$node2->poll_query_until('postgres', "SELECT pg_get_wal_replay_pause_state() = 'paused';"); +$node2->safe_psql('postgres', "SELECT pg_promote()"); + +# Once promotion occurs, insert a second row on the new node. +$node2->poll_query_until('postgres', "SELECT pg_is_in_recovery() = 'f';"); +$node2->safe_psql('postgres', <<EOM); +INSERT INTO mytable VALUES (2, 'blackberry'); +EOM + +# Now take an incremental backup. If WAL summarization didn't follow the +# timeline change correctly, something should break at this point. +my $backup2path = $node1->backup_dir . '/backup2'; +$node2->command_ok( + [ 'pg_basebackup', '-D', $backup2path, '--no-sync', '-cfast', + '--incremental', $backup1path . '/backup_manifest' ], + "incremental backup from node2"); + +# Restore the incremental backup and use it to create a new node. 
+my $node3 = PostgreSQL::Test::Cluster->new('node3'); +$node3->init_from_backup($node1, 'backup2', + combine_with_prior => [ 'backup1' ]); +$node3->start(); + +done_testing(); diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 1a1f11a943f..2c507ea618c 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -248,6 +248,7 @@ extern XLogRecPtr GetRedoRecPtr(void); extern XLogRecPtr GetInsertRecPtr(void); extern XLogRecPtr GetFlushRecPtr(TimeLineID *insertTLI); extern TimeLineID GetWALInsertionTimeLine(void); +extern TimeLineID GetWALInsertionTimeLineIfSet(void); extern XLogRecPtr GetLastImportantRecPtr(void); extern void SetWalWriterSleeping(bool sleeping); diff --git a/src/include/postmaster/walsummarizer.h b/src/include/postmaster/walsummarizer.h index 112bc1e6cba..aedca556764 100644 --- a/src/include/postmaster/walsummarizer.h +++ b/src/include/postmaster/walsummarizer.h @@ -30,7 +30,6 @@ extern void GetWalSummarizerState(TimeLineID *summarized_tli, extern XLogRecPtr GetOldestUnsummarizedLSN(TimeLineID *tli, bool *lsn_is_exact); extern void SetWalSummarizerLatch(void); -extern XLogRecPtr WaitForWalSummarization(XLogRecPtr lsn, long timeout, - XLogRecPtr *pending_lsn); +extern void WaitForWalSummarization(XLogRecPtr lsn); #endif |