Diffstat (limited to 'src/test/recovery')
-rw-r--r--  src/test/recovery/meson.build                             2
-rw-r--r--  src/test/recovery/t/003_recovery_targets.pl              50
-rw-r--r--  src/test/recovery/t/013_crash_restart.pl                  2
-rw-r--r--  src/test/recovery/t/022_crash_temp_files.pl               2
-rw-r--r--  src/test/recovery/t/032_relfilenode_reuse.pl              2
-rw-r--r--  src/test/recovery/t/037_invalid_database.pl               2
-rw-r--r--  src/test/recovery/t/040_standby_failover_slots_sync.pl    3
-rw-r--r--  src/test/recovery/t/047_checkpoint_physical_slot.pl     132
-rw-r--r--  src/test/recovery/t/048_vacuum_horizon_floor.pl         288
9 files changed, 477 insertions(+), 6 deletions(-)
diff --git a/src/test/recovery/meson.build b/src/test/recovery/meson.build
index cb983766c67..6e78ff1a030 100644
--- a/src/test/recovery/meson.build
+++ b/src/test/recovery/meson.build
@@ -54,6 +54,8 @@ tests += {
't/043_no_contrecord_switch.pl',
't/044_invalidate_inactive_slots.pl',
't/045_archive_restartpoint.pl',
+ 't/047_checkpoint_physical_slot.pl',
+ 't/048_vacuum_horizon_floor.pl'
],
},
}
diff --git a/src/test/recovery/t/003_recovery_targets.pl b/src/test/recovery/t/003_recovery_targets.pl
index 0ae2e982727..f2109efa9b1 100644
--- a/src/test/recovery/t/003_recovery_targets.pl
+++ b/src/test/recovery/t/003_recovery_targets.pl
@@ -187,4 +187,54 @@ ok( $logfile =~
qr/FATAL: .* recovery ended before configured recovery target was reached/,
'recovery end before target reached is a fatal error');
+# Invalid timeline target
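+#
+# recovery_target_timeline accepts 'current', 'latest', or a numeric TLI;
+# the three startup attempts below exercise a non-numeric value and both
+# ends of the valid numeric range (1 .. 4294967295).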
+$node_standby = PostgreSQL::Test::Cluster->new('standby_9');
+$node_standby->init_from_backup($node_primary, 'my_backup',
+ has_restoring => 1);
+$node_standby->append_conf('postgresql.conf',
+ "recovery_target_timeline = 'bogus'");
+
+$res = run_log(
+ [
+ 'pg_ctl',
+ '--pgdata' => $node_standby->data_dir,
+ '--log' => $node_standby->logfile,
+ 'start',
+ ]);
+ok(!$res, 'invalid timeline target (bogus value)');
+
+my $log_start = $node_standby->wait_for_log("is not a valid number");
+
+# Timeline target out of min range
+$node_standby->append_conf('postgresql.conf',
+ "recovery_target_timeline = '0'");
+
+$res = run_log(
+ [
+ 'pg_ctl',
+ '--pgdata' => $node_standby->data_dir,
+ '--log' => $node_standby->logfile,
+ 'start',
+ ]);
+ok(!$res, 'invalid timeline target (lower bound check)');
+
+$log_start =
+ $node_standby->wait_for_log("must be between 1 and 4294967295", $log_start);
+
+# Timeline target out of max range
+$node_standby->append_conf('postgresql.conf',
+ "recovery_target_timeline = '4294967296'");
+
+$res = run_log(
+ [
+ 'pg_ctl',
+ '--pgdata' => $node_standby->data_dir,
+ '--log' => $node_standby->logfile,
+ 'start',
+ ]);
+ok(!$res, 'invalid timeline target (upper bound check)');
+
+$log_start =
+ $node_standby->wait_for_log("must be between 1 and 4294967295", $log_start);
+
done_testing();
diff --git a/src/test/recovery/t/013_crash_restart.pl b/src/test/recovery/t/013_crash_restart.pl
index 4e60806563f..debfa635c36 100644
--- a/src/test/recovery/t/013_crash_restart.pl
+++ b/src/test/recovery/t/013_crash_restart.pl
@@ -27,7 +27,7 @@ $node->start();
$node->safe_psql(
'postgres',
q[ALTER SYSTEM SET restart_after_crash = 1;
- ALTER SYSTEM SET log_connections = 1;
+ ALTER SYSTEM SET log_connections = receipt;
SELECT pg_reload_conf();]);
# Run psql, keeping session alive, so we have an alive backend to kill.
diff --git a/src/test/recovery/t/022_crash_temp_files.pl b/src/test/recovery/t/022_crash_temp_files.pl
index 50def031c96..0b68860bd3e 100644
--- a/src/test/recovery/t/022_crash_temp_files.pl
+++ b/src/test/recovery/t/022_crash_temp_files.pl
@@ -26,7 +26,7 @@ $node->start();
$node->safe_psql(
'postgres',
q[ALTER SYSTEM SET remove_temp_files_after_crash = on;
- ALTER SYSTEM SET log_connections = 1;
+ ALTER SYSTEM SET log_connections = receipt;
ALTER SYSTEM SET work_mem = '64kB';
ALTER SYSTEM SET restart_after_crash = on;
SELECT pg_reload_conf();]);
diff --git a/src/test/recovery/t/032_relfilenode_reuse.pl b/src/test/recovery/t/032_relfilenode_reuse.pl
index 492ef115ba4..0c44883cc34 100644
--- a/src/test/recovery/t/032_relfilenode_reuse.pl
+++ b/src/test/recovery/t/032_relfilenode_reuse.pl
@@ -14,7 +14,7 @@ $node_primary->init(allows_streaming => 1);
$node_primary->append_conf(
'postgresql.conf', q[
allow_in_place_tablespaces = true
-log_connections=on
+log_connections=receipt
# to avoid "repairing" corruption
full_page_writes=off
log_min_messages=debug2
diff --git a/src/test/recovery/t/037_invalid_database.pl b/src/test/recovery/t/037_invalid_database.pl
index bdf39397397..dc52c55c7af 100644
--- a/src/test/recovery/t/037_invalid_database.pl
+++ b/src/test/recovery/t/037_invalid_database.pl
@@ -15,7 +15,7 @@ $node->append_conf(
autovacuum = off
max_prepared_transactions=5
log_min_duration_statement=0
-log_connections=on
+log_connections=receipt
log_disconnections=on
));
diff --git a/src/test/recovery/t/040_standby_failover_slots_sync.pl b/src/test/recovery/t/040_standby_failover_slots_sync.pl
index 9c8b49e942d..2c61c51e914 100644
--- a/src/test/recovery/t/040_standby_failover_slots_sync.pl
+++ b/src/test/recovery/t/040_standby_failover_slots_sync.pl
@@ -941,8 +941,7 @@ is( $standby1->safe_psql(
'synced slot retained on the new primary');
# Commit the prepared transaction
-$standby1->safe_psql('postgres',
- "COMMIT PREPARED 'test_twophase_slotsync';");
+$standby1->safe_psql('postgres', "COMMIT PREPARED 'test_twophase_slotsync';");
$standby1->wait_for_catchup('regress_mysub1');
# Confirm that the prepared transaction is replicated to the subscriber
diff --git a/src/test/recovery/t/047_checkpoint_physical_slot.pl b/src/test/recovery/t/047_checkpoint_physical_slot.pl
new file mode 100644
index 00000000000..a1332b5d44c
--- /dev/null
+++ b/src/test/recovery/t/047_checkpoint_physical_slot.pl
@@ -0,0 +1,132 @@
+# Copyright (c) 2025, PostgreSQL Global Development Group
+#
+# This test verifies the case where a physical slot is advanced during a
+# checkpoint. It checks that the physical slot's restart_lsn still refers
+# to an existing WAL segment after an immediate restart.
+#
+use strict;
+use warnings FATAL => 'all';
+
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+
+use Test::More;
+
+if ($ENV{enable_injection_points} ne 'yes')
+{
+ plan skip_all => 'Injection points not supported by this build';
+}
+
+my ($node, $result);
+
+$node = PostgreSQL::Test::Cluster->new('mike');
+$node->init;
+$node->append_conf('postgresql.conf', "wal_level = 'replica'");
+$node->start;
+
+# Check if the extension injection_points is available, as it may be
+# possible that this script is run with installcheck, where the module
+# would not be installed by default.
+if (!$node->check_extension('injection_points'))
+{
+ plan skip_all => 'Extension injection_points not installed';
+}
+
+$node->safe_psql('postgres', q(CREATE EXTENSION injection_points));
+
+# Create a physical replication slot.
+$node->safe_psql('postgres',
+ q{select pg_create_physical_replication_slot('slot_physical', true)});
+
+# Advance slot to the current position, just to have everything "valid".
+$node->safe_psql('postgres',
+ q{select pg_replication_slot_advance('slot_physical', pg_current_wal_lsn())}
+);
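+
+# (For reference only: the slot's position can be checked at any point with
+#   select slot_name, restart_lsn, wal_status from pg_replication_slots;
+# this is illustrative and not required by the test.)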
+
+# Run checkpoint to flush current state to disk and set a baseline.
+$node->safe_psql('postgres', q{checkpoint});
+
+# Generate about 20 segments (~320MB) worth of WAL.
+$node->advance_wal(20);
+
+# Advance slot to the current position, just to have everything "valid".
+$node->safe_psql('postgres',
+ q{select pg_replication_slot_advance('slot_physical', pg_current_wal_lsn())}
+);
+
+# Run another checkpoint to set a new restart LSN.
+$node->safe_psql('postgres', q{checkpoint});
+
+# Generate another ~20 segments worth of WAL.
+$node->advance_wal(20);
+
+my $restart_lsn_init = $node->safe_psql('postgres',
+ q{select restart_lsn from pg_replication_slots where slot_name = 'slot_physical'}
+);
+chomp($restart_lsn_init);
+note("restart lsn before checkpoint: $restart_lsn_init");
+
+# Run another checkpoint, this time in the background, and make it wait on
+# the injection point so that the checkpoint stops right before removing old
+# WAL segments.
+note('starting checkpoint');
+
+my $checkpoint = $node->background_psql('postgres');
+$checkpoint->query_safe(
+ q{select injection_points_attach('checkpoint-before-old-wal-removal','wait')}
+);
+$checkpoint->query_until(
+ qr/starting_checkpoint/,
+ q(\echo starting_checkpoint
+checkpoint;
+\q
+));
+
+# Wait until the checkpoint stops right before removing WAL segments.
+note('waiting for injection_point');
+$node->wait_for_event('checkpointer', 'checkpoint-before-old-wal-removal');
+note('injection_point is reached');
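+# (wait_for_event() polls pg_stat_activity until the named backend type
+# reports the given wait_event, so at this point the checkpointer is parked
+# on the injection point.)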
+
+# OK, we're in the right situation: time to advance the physical slot, which
+# recalculates the required LSN, and then unblock the checkpoint, which
+# removes the WAL still needed by the physical slot.
+$node->safe_psql('postgres',
+ q{select pg_replication_slot_advance('slot_physical', pg_current_wal_lsn())}
+);
+
+# Continue the checkpoint.
+$node->safe_psql('postgres',
+ q{select injection_points_wakeup('checkpoint-before-old-wal-removal')});
+
+my $restart_lsn_old = $node->safe_psql('postgres',
+ q{select restart_lsn from pg_replication_slots where slot_name = 'slot_physical'}
+);
+chomp($restart_lsn_old);
+note("restart lsn before stop: $restart_lsn_old");
+
+# Abruptly stop the server (by this point the unblocked checkpoint should
+# have had enough time to finish).
+$node->stop('immediate');
+
+$node->start;
+
+# Get the restart_lsn of the slot right after restarting.
+my $restart_lsn = $node->safe_psql('postgres',
+ q{select restart_lsn from pg_replication_slots where slot_name = 'slot_physical'}
+);
+chomp($restart_lsn);
+note("restart lsn: $restart_lsn");
+
+# Get the WAL segment name for the slot's restart_lsn.
+my $restart_lsn_segment = $node->safe_psql('postgres',
+ "SELECT pg_walfile_name('$restart_lsn'::pg_lsn)");
+chomp($restart_lsn_segment);
+
+# Check that the WAL segment required by the slot still exists.
+note("segment required by the slot: $restart_lsn_segment");
+my $datadir = $node->data_dir;
+ok( -f "$datadir/pg_wal/$restart_lsn_segment",
+ "WAL segment $restart_lsn_segment for physical slot's restart_lsn $restart_lsn exists"
+);
+
+done_testing();
diff --git a/src/test/recovery/t/048_vacuum_horizon_floor.pl b/src/test/recovery/t/048_vacuum_horizon_floor.pl
new file mode 100644
index 00000000000..e56fce59d58
--- /dev/null
+++ b/src/test/recovery/t/048_vacuum_horizon_floor.pl
@@ -0,0 +1,288 @@
+use strict;
+use warnings;
+use PostgreSQL::Test::Cluster;
+use Test::More;
+
+# Test that vacuum prunes away all dead tuples killed before OldestXmin
+#
+# This test creates a table on a primary, updates the table to generate dead
+# tuples for vacuum, and then, during the vacuum, uses the replica to force
+# GlobalVisState->maybe_needed on the primary to move backwards and precede
+# the value of OldestXmin set at the beginning of vacuuming the table.
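+#
+# The horizon can move backwards here because, with hot_standby_feedback = on,
+# a reconnecting standby reports its xmin to the primary, and that xmin can be
+# older than the horizon the primary has already computed.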
+
+# Set up nodes
+my $node_primary = PostgreSQL::Test::Cluster->new('primary');
+$node_primary->init(allows_streaming => 'physical');
+
+# io_combine_limit is set to 1 to avoid pinning more than one buffer at a
+# time, which helps ensure test determinism.
+$node_primary->append_conf(
+ 'postgresql.conf', qq[
+hot_standby_feedback = on
+autovacuum = off
+log_min_messages = INFO
+maintenance_work_mem = 64
+io_combine_limit = 1
+]);
+$node_primary->start;
+
+my $node_replica = PostgreSQL::Test::Cluster->new('standby');
+
+$node_primary->backup('my_backup');
+$node_replica->init_from_backup($node_primary, 'my_backup',
+ has_streaming => 1);
+
+$node_replica->start;
+
+my $test_db = "test_db";
+$node_primary->safe_psql('postgres', "CREATE DATABASE $test_db");
+
+# Save the original connection info for later use
+my $orig_conninfo = $node_primary->connstr();
+
+my $table1 = "vac_horizon_floor_table";
+
+# Long-running Primary Session A
+my $psql_primaryA =
+ $node_primary->background_psql($test_db, on_error_stop => 1);
+
+# Long-running Primary Session B
+my $psql_primaryB =
+ $node_primary->background_psql($test_db, on_error_stop => 1);
+
+# Our test relies on two rounds of index vacuuming for reasons elaborated
+# later. To trigger two rounds of index vacuuming, we must fill up the
+# TIDStore with dead items partway through a vacuum of the table. The number
+# of rows is just enough to ensure we exceed maintenance_work_mem on all
+# supported platforms, while keeping test runtime as short as we can.
+my $nrows = 2000;
+
+# Because vacuum's first pass, pruning, is where we use the GlobalVisState to
+# check tuple visibility, GlobalVisState->maybe_needed must move backwards
+# during pruning before checking the visibility for a tuple which would have
+# been considered HEAPTUPLE_DEAD prior to maybe_needed moving backwards but
+# HEAPTUPLE_RECENTLY_DEAD compared to the new, older value of maybe_needed.
+#
+# We must not only force the horizon on the primary to move backwards but also
+# force the vacuuming backend's GlobalVisState to be updated. GlobalVisState
+# is forced to update during index vacuuming.
+#
+# _bt_pendingfsm_finalize() calls GetOldestNonRemovableTransactionId() at the
+# end of a round of index vacuuming, updating the backend's GlobalVisState
+# and, in our case, moving maybe_needed backwards.
+#
+# Then vacuum's first (pruning) pass will continue and pruning will find our
+# later inserted and updated tuple HEAPTUPLE_RECENTLY_DEAD when compared to
+# maybe_needed but HEAPTUPLE_DEAD when compared to OldestXmin.
+#
+# Thus, we must force at least two rounds of index vacuuming to ensure that
+# some tuple visibility checks will happen after a round of index vacuuming.
+# To accomplish this, we set maintenance_work_mem to its minimum value and
+# insert and delete enough rows that we force at least one round of index
+# vacuuming before getting to a dead tuple which was killed after the standby
+# is disconnected.
+$node_primary->safe_psql(
+ $test_db, qq[
+ CREATE TABLE ${table1}(col1 int)
+ WITH (autovacuum_enabled=false, fillfactor=10);
+ INSERT INTO $table1 VALUES(7);
+ INSERT INTO $table1 SELECT generate_series(1, $nrows) % 3;
+ CREATE INDEX on ${table1}(col1);
+ DELETE FROM $table1 WHERE col1 = 0;
+ INSERT INTO $table1 VALUES(7);
+]);
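+
+# (Layout note: the first 7 is on the table's first page, where the cursor
+# opened below will hold its pin; the last 7, inserted at the end, lands on
+# the final page, so the cursor's second FETCH releases vacuum only after
+# most of the table has been processed.)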
+
+# We will later move the primary forward while the standby is disconnected.
+# For now, however, there is no reason not to wait for the standby to catch
+# up.
+my $primary_lsn = $node_primary->lsn('flush');
+$node_primary->wait_for_catchup($node_replica, 'replay', $primary_lsn);
+
+# Test that the WAL receiver is up and running.
+$node_replica->poll_query_until(
+ $test_db, qq[
+ SELECT EXISTS (SELECT * FROM pg_stat_wal_receiver);], 't');
+
+# Set primary_conninfo to something invalid on the replica and reload the
+# config. Once the config is reloaded, the startup process will force the WAL
+# receiver to restart and it will be unable to reconnect because of the
+# invalid connection information.
+$node_replica->safe_psql(
+ $test_db, qq[
+ ALTER SYSTEM SET primary_conninfo = '';
+ SELECT pg_reload_conf();
+ ]);
+
+# Wait until the WAL receiver has shut down and been unable to start up again.
+$node_replica->poll_query_until(
+ $test_db, qq[
+ SELECT EXISTS (SELECT * FROM pg_stat_wal_receiver);], 'f');
+
+# Now insert and update a tuple which will be visible to the vacuum on the
+# primary but which will have xmax newer than the oldest xmin on the standby
+# that was recently disconnected.
+my $res = $psql_primaryA->query_safe(
+ qq[
+ INSERT INTO $table1 VALUES (99);
+ UPDATE $table1 SET col1 = 100 WHERE col1 = 99;
+ SELECT 'after_update';
+ ]
+);
+
+# Make sure the UPDATE finished
+like($res, qr/^after_update$/m, "UPDATE occurred on primary session A");
+
+# Open a cursor on the primary whose pin will keep VACUUM from getting a
+# cleanup lock on the first page of the relation. We want VACUUM to be able to
+# start, calculate initial values for OldestXmin and GlobalVisState and then
+# be unable to proceed with pruning our dead tuples. This will allow us to
+# reconnect the standby and push the horizon back before we start actual
+# pruning and vacuuming.
+my $primary_cursor1 = "vac_horizon_floor_cursor1";
+
+# The first value inserted into the table was a 7, so FETCH FORWARD should
+# return a 7. That's how we know the cursor has a pin.
+# Disable index scans so the cursor pins heap pages and not index pages.
+$res = $psql_primaryB->query_safe(
+ qq[
+ BEGIN;
+ SET enable_bitmapscan = off;
+ SET enable_indexscan = off;
+ SET enable_indexonlyscan = off;
+ DECLARE $primary_cursor1 CURSOR FOR SELECT * FROM $table1 WHERE col1 = 7;
+ FETCH $primary_cursor1;
+ ]
+);
+
+is($res, 7, qq[Cursor query returned $res. Expected value 7.]);
+
+# Get the PID of the session which will run the VACUUM FREEZE so that we can
+# use it to filter pg_stat_activity later.
+my $vacuum_pid = $psql_primaryA->query_safe("SELECT pg_backend_pid();");
+
+# Now start a VACUUM FREEZE on the primary. It will call vacuum_get_cutoffs()
+# and establish values of OldestXmin and GlobalVisState which are newer than
+# all of our dead tuples. Then it will be unable to get a cleanup lock to
+# start pruning, so it will hang.
+#
+# We use VACUUM FREEZE because it will wait for a cleanup lock instead of
+# skipping the page pinned by the cursor. Note that this works because the
+# target tuple's xmax precedes OldestXmin, which ensures that
+# lazy_scan_noprune() will return false and we will wait for the cleanup
+# lock.
+#
+# Disable any prefetching, parallelism, or other concurrent I/O by vacuum. The
+# pages of the heap must be processed in order by a single worker to ensure
+# test stability (PARALLEL 0 shouldn't be necessary but guards against the
+# possibility of parallel heap vacuuming).
+$psql_primaryA->{stdin} .= qq[
+ SET maintenance_io_concurrency = 0;
+ VACUUM (VERBOSE, FREEZE, PARALLEL 0) $table1;
+ \\echo VACUUM
+ ];
+
+# Make sure the VACUUM command makes it to the server.
+$psql_primaryA->{run}->pump_nb();
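+
+# (The VACUUM is pushed onto the session's stdin rather than run with
+# query_safe() because it is about to block on the pinned buffer; a
+# synchronous query call would hang the test at this point.)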
+
+# Make sure that the VACUUM has already called vacuum_get_cutoffs() and is
+# just waiting on the lock to start vacuuming. We don't want the standby to
+# re-establish a connection to the primary and push the horizon back until
+# we've saved initial values in GlobalVisState and calculated OldestXmin.
+$node_primary->poll_query_until(
+ $test_db,
+ qq[
+ SELECT count(*) >= 1 FROM pg_stat_activity
+ WHERE pid = $vacuum_pid
+ AND wait_event = 'BufferPin';
+ ],
+ 't');
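+
+# ('BufferPin' is the wait event a backend reports while it waits for a
+# buffer cleanup lock, i.e. while the cursor still holds its pin.)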
+
+# Ensure the WAL receiver is still not active on the replica.
+$node_replica->poll_query_until(
+ $test_db, qq[
+ SELECT EXISTS (SELECT * FROM pg_stat_wal_receiver);], 'f');
+
+# Allow the WAL receiver connection to re-establish.
+$node_replica->safe_psql(
+ $test_db, qq[
+ ALTER SYSTEM SET primary_conninfo = '$orig_conninfo';
+ SELECT pg_reload_conf();
+ ]);
+
+# Ensure the new WAL receiver has connected.
+$node_replica->poll_query_until(
+ $test_db, qq[
+ SELECT EXISTS (SELECT * FROM pg_stat_wal_receiver);], 't');
+
+# Once the WAL sender appears on the primary, the replica will have
+# connected to the primary and pushed the horizon backward. Primary Session
+# A won't see that until the VACUUM FREEZE proceeds and does its first round
+# of index vacuuming.
+$node_primary->poll_query_until(
+ $test_db, qq[
+ SELECT EXISTS (SELECT * FROM pg_stat_replication);], 't');
+
+# Move the cursor forward to the next 7. We inserted the 7 much later, so
+# advancing the cursor should allow vacuum to proceed vacuuming most pages of
+# the relation. Because we set maintenance_work_mem sufficiently low, we
+# expect that a round of index vacuuming has happened and that the vacuum is
+# now waiting for the cursor to release its pin on the last page of the
+# relation.
+$res = $psql_primaryB->query_safe("FETCH $primary_cursor1");
+is($res, 7,
+ qq[Cursor query returned $res from second fetch. Expected value 7.]);
+
+# Prevent the test from incorrectly passing by confirming that we did indeed
+# do a pass of index vacuuming.
+$node_primary->poll_query_until(
+ $test_db, qq[
+ SELECT index_vacuum_count > 0
+ FROM pg_stat_progress_vacuum
+ WHERE datname='$test_db' AND relid::regclass = '$table1'::regclass;
+ ], 't');
+
+# Commit the transaction with the open cursor so that the VACUUM can finish.
+$psql_primaryB->query_until(
+ qr/^commit$/m,
+ qq[
+ COMMIT;
+ \\echo commit
+ ]
+);
+
+# VACUUM proceeds with pruning and does a visibility check on each tuple. In
+# older versions of Postgres, pruning found our final dead tuple
+# non-removable (HEAPTUPLE_RECENTLY_DEAD) since its xmax is after the new
+# value of maybe_needed. Then heap_prepare_freeze_tuple() would decide the
+# tuple xmax should be frozen because it precedes OldestXmin. Vacuum would
+# then error out in heap_pre_freeze_checks() with "cannot freeze committed
+# xmax". This was fixed by changing pruning to treat all
+# HEAPTUPLE_RECENTLY_DEAD tuples with xmaxes preceding OldestXmin as
+# HEAPTUPLE_DEAD and to remove them.
+
+# With the fix, VACUUM should finish successfully, incrementing the table
+# vacuum_count.
+$node_primary->poll_query_until(
+ $test_db,
+ qq[
+ SELECT vacuum_count > 0
+ FROM pg_stat_all_tables WHERE relname = '${table1}';
+ ]
+ , 't');
+
+$primary_lsn = $node_primary->lsn('flush');
+
+# Make sure something causes us to flush
+$node_primary->safe_psql($test_db, "INSERT INTO $table1 VALUES (1);");
+
+# Nothing on the replica should cause a recovery conflict, so this should
+# finish successfully.
+$node_primary->wait_for_catchup($node_replica, 'replay', $primary_lsn);
+
+## Shut down psqls
+$psql_primaryA->quit;
+$psql_primaryB->quit;
+
+$node_replica->stop();
+$node_primary->stop();
+
+done_testing();