aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMichael Paquier <michael@paquier.xyz>2024-01-23 14:46:01 +0900
committerMichael Paquier <michael@paquier.xyz>2024-01-23 14:46:01 +0900
commit46d8587b504170ca14f064890bc7ccbbd7523f81 (patch)
tree053498d5060e795a015de24bec74c605c0fdca27
parentb262ad440edecda0b1aba81d967ab560a83acb8a (diff)
downloadpostgresql-46d8587b504170ca14f064890bc7ccbbd7523f81.tar.gz
postgresql-46d8587b504170ca14f064890bc7ccbbd7523f81.zip
Improve stability of recovery test 035_standby_logical_decoding
This commit tweaks a couple of things in 035_standby_logical_decoding to hopefully stabilize it: - Autovacuum is now disabled, as it could hold a global xmin with a transaction. - Conflicts are generated with command sequences that removed rows (on catalogs, shared or non-shared, or just plain relations) followed by a VACUUM. This was unstable because these did not check that the horizon moved between the SQL commands and the VACUUM. The logic is refactored as follows, to ensure that VACUUM removes dead rows before testing for slot invalidation on a standby (idea suggested by Andres Freund): -- Grab the current horizon. -- Launch SQL commands removing rows. -- Check that the snapshot horizon has been updated. -- Launch VACUUM on the relation whose rows have been removed by the first step. Note that there are still some issues because of standby snapshot WAL records generated by the bgwriter, but this makes the test much more stable. Per reports from buildfarm members dikkop and skink, with analysis and tests from Alexander Lakhin. While on it, fix a couple of incorrect comments. Author: Bertrand Drouvot Reviewed-by: Alexander Lakhin, Michael Paquier Discussion: https://postgr.es/m/OSZPR01MB6310ED3CEDB531BCEDBC6AF2FD479@OSZPR01MB6310.jpnprd01.prod.outlook.com Discussion: https://postgr.es/m/bf67e076-b163-9ba3-4ade-b9fc51a3a8f6@gmail.com Backpatch-through: 16
-rw-r--r--src/test/recovery/t/035_standby_logical_decoding.pl90
1 files changed, 56 insertions, 34 deletions
diff --git a/src/test/recovery/t/035_standby_logical_decoding.pl b/src/test/recovery/t/035_standby_logical_decoding.pl
index 8bc39a5f033..cebfa52d0fc 100644
--- a/src/test/recovery/t/035_standby_logical_decoding.pl
+++ b/src/test/recovery/t/035_standby_logical_decoding.pl
@@ -238,6 +238,40 @@ sub check_for_invalidation
) or die "Timed out waiting confl_active_logicalslot to be updated";
}
+# Launch $sql query, wait for a new snapshot that has a newer horizon and
+# launch a VACUUM. $vac_option is the set of options to be passed to the
+# VACUUM command, $sql the sql to launch before triggering the vacuum and
+# $to_vac the relation to vacuum.
+#
+# Note that pg_current_snapshot() is used to get the horizon. It does
+# not generate a Transaction/COMMIT WAL record, decreasing the risk of
+# seeing a xl_running_xacts that would advance an active replication slot's
+# catalog_xmin. Advancing the active replication slot's catalog_xmin
+# would break some tests that expect the active slot to conflict with
+# the catalog xmin horizon.
+sub wait_until_vacuum_can_remove
+{
+ my ($vac_option, $sql, $to_vac) = @_;
+
+ # Get the current xid horizon,
+ my $xid_horizon = $node_primary->safe_psql('testdb',
+ qq[select pg_snapshot_xmin(pg_current_snapshot());]);
+
+ # Launch our sql.
+ $node_primary->safe_psql('testdb', qq[$sql]);
+
+ # Wait until we get a newer horizon.
+ $node_primary->poll_query_until('testdb',
+ "SELECT (select pg_snapshot_xmin(pg_current_snapshot())::text::int - $xid_horizon) > 0"
+ ) or die "new snapshot does not have a newer horizon";
+
+ # Launch the vacuum command and insert into flush_wal (see CREATE
+ # TABLE flush_wal for the reason).
+ $node_primary->safe_psql(
+ 'testdb', qq[VACUUM $vac_option verbose $to_vac;
+ INSERT INTO flush_wal DEFAULT VALUES;]);
+}
+
########################
# Initialize primary node
########################
@@ -248,6 +282,7 @@ $node_primary->append_conf(
wal_level = 'logical'
max_replication_slots = 4
max_wal_senders = 4
+autovacuum = off
});
$node_primary->dump_info;
$node_primary->start;
@@ -468,13 +503,9 @@ reactive_slots_change_hfs_and_wait_for_xmins('behaves_ok_', 'vacuum_full_',
0, 1);
# This should trigger the conflict
-$node_primary->safe_psql(
- 'testdb', qq[
- CREATE TABLE conflict_test(x integer, y text);
- DROP TABLE conflict_test;
- VACUUM full pg_class;
- INSERT INTO flush_wal DEFAULT VALUES; -- see create table flush_wal
-]);
+wait_until_vacuum_can_remove(
+ 'full', 'CREATE TABLE conflict_test(x integer, y text);
+ DROP TABLE conflict_test;', 'pg_class');
$node_primary->wait_for_replay_catchup($node_standby);
@@ -545,18 +576,15 @@ ok(!-f "$standby_walfile",
my $logstart = -s $node_standby->logfile;
# One way to produce recovery conflict is to create/drop a relation and
-# launch a vacuum on pg_class with hot_standby_feedback turned off on the standby.
+# launch a vacuum on pg_class with hot_standby_feedback turned off on the
+# standby.
reactive_slots_change_hfs_and_wait_for_xmins('vacuum_full_', 'row_removal_',
0, 1);
# This should trigger the conflict
-$node_primary->safe_psql(
- 'testdb', qq[
- CREATE TABLE conflict_test(x integer, y text);
- DROP TABLE conflict_test;
- VACUUM pg_class;
- INSERT INTO flush_wal DEFAULT VALUES; -- see create table flush_wal
-]);
+wait_until_vacuum_can_remove(
+ '', 'CREATE TABLE conflict_test(x integer, y text);
+ DROP TABLE conflict_test;', 'pg_class');
$node_primary->wait_for_replay_catchup($node_standby);
@@ -582,19 +610,16 @@ check_pg_recvlogical_stderr($handle,
# get the position to search from in the standby logfile
$logstart = -s $node_standby->logfile;
-# One way to produce recovery conflict is to create/drop a relation and
-# launch a vacuum on pg_class with hot_standby_feedback turned off on the standby.
+# One way to produce recovery conflict on a shared catalog table is to
+# create/drop a role and launch a vacuum on pg_authid with
+# hot_standby_feedback turned off on the standby.
reactive_slots_change_hfs_and_wait_for_xmins('row_removal_',
'shared_row_removal_', 0, 1);
# Trigger the conflict
-$node_primary->safe_psql(
- 'testdb', qq[
- CREATE ROLE create_trash;
- DROP ROLE create_trash;
- VACUUM pg_authid;
- INSERT INTO flush_wal DEFAULT VALUES; -- see create table flush_wal
-]);
+wait_until_vacuum_can_remove(
+ '', 'CREATE ROLE create_trash;
+ DROP ROLE create_trash;', 'pg_authid');
$node_primary->wait_for_replay_catchup($node_standby);
@@ -625,14 +650,11 @@ reactive_slots_change_hfs_and_wait_for_xmins('shared_row_removal_',
'no_conflict_', 0, 1);
# This should not trigger a conflict
-$node_primary->safe_psql(
- 'testdb', qq[
- CREATE TABLE conflict_test(x integer, y text);
- INSERT INTO conflict_test(x,y) SELECT s, s::text FROM generate_series(1,4) s;
- UPDATE conflict_test set x=1, y=1;
- VACUUM conflict_test;
- INSERT INTO flush_wal DEFAULT VALUES; -- see create table flush_wal
-]);
+wait_until_vacuum_can_remove(
+ '', 'CREATE TABLE conflict_test(x integer, y text);
+ INSERT INTO conflict_test(x,y) SELECT s, s::text FROM generate_series(1,4) s;
+ UPDATE conflict_test set x=1, y=1;', 'conflict_test');
+
$node_primary->wait_for_replay_catchup($node_standby);
# message should not be issued
@@ -671,7 +693,7 @@ $node_standby->restart;
##################################################
# Recovery conflict: Invalidate conflicting slots, including in-use slots
-# Scenario 4: conflict due to on-access pruning.
+# Scenario 5: conflict due to on-access pruning.
##################################################
# get the position to search from in the standby logfile
@@ -711,7 +733,7 @@ change_hot_standby_feedback_and_wait_for_xmins(1, 1);
##################################################
# Recovery conflict: Invalidate conflicting slots, including in-use slots
-# Scenario 5: incorrect wal_level on primary.
+# Scenario 6: incorrect wal_level on primary.
##################################################
# get the position to search from in the standby logfile