aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMichael Paquier <michael@paquier.xyz>2024-03-06 14:38:25 +0900
committerMichael Paquier <michael@paquier.xyz>2024-03-06 14:39:40 +0900
commit08a52ab151ca599406883768cdc08b7929e516de (patch)
tree095d96a2772afb6c0a509233ca9b3a416c90dedd
parentd93627bcbe5001750e7611f0e637200e2d81dcff (diff)
downloadpostgresql-08a52ab151ca599406883768cdc08b7929e516de.tar.gz
postgresql-08a52ab151ca599406883768cdc08b7929e516de.zip
Add recovery TAP test for race condition with slot invalidations
This commit adds a recovery test to provide coverage for the bug fixed in 818fefd8fd, using an injection point to wait just after the process holding an active slot is killed. The trick is to give enough time for effective_xmin and effective_catalog_xmin to advance, so that the robustness of slot invalidation can be checked, since the active process is killed while its slot's mutex is not held for a short time.
Author: Bertrand Drouvot
Discussion: https://postgr.es/m/ZdyZya4YrNapWKqz@ip-10-97-1-34.eu-west-3.compute.internal
-rw-r--r--src/backend/replication/slot.c9
-rw-r--r--src/test/recovery/t/035_standby_logical_decoding.pl116
2 files changed, 123 insertions, 2 deletions
diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c
index 2614f98ddd2..02ae27499b5 100644
--- a/src/backend/replication/slot.c
+++ b/src/backend/replication/slot.c
@@ -53,6 +53,7 @@
#include "storage/proc.h"
#include "storage/procarray.h"
#include "utils/builtins.h"
+#include "utils/injection_point.h"
/*
* Replication slot on-disk data structure.
@@ -1658,6 +1659,14 @@ InvalidatePossiblyObsoleteSlot(ReplicationSlotInvalidationCause cause,
last_signaled_pid = active_pid;
terminated = true;
conflict_prev = conflict;
+
+ /*
+ * This injection point needs to be after kill() to ensure
+ * that the slot is not "active" anymore. It also has to be
+ * after ReportSlotInvalidation() to ensure that the
+ * invalidation message is logged.
+ */
+ INJECTION_POINT("terminate-process-holding-slot");
}
/* Wait until the slot is released. */
diff --git a/src/test/recovery/t/035_standby_logical_decoding.pl b/src/test/recovery/t/035_standby_logical_decoding.pl
index 0710bccc176..2659d4bb52e 100644
--- a/src/test/recovery/t/035_standby_logical_decoding.pl
+++ b/src/test/recovery/t/035_standby_logical_decoding.pl
@@ -733,14 +733,126 @@ change_hot_standby_feedback_and_wait_for_xmins(1, 1);
##################################################
# Recovery conflict: Invalidate conflicting slots, including in-use slots
-# Scenario 6: incorrect wal_level on primary.
+# Scenario 6: Race condition between slot invalidation and active process
+# termination.
+##################################################
+SKIP:
+{
+ skip "Injection points not supported by this build", 1
+ if (($ENV{enable_injection_points} // '') ne 'yes');
+
+ # Get the position to search from in the standby logfile
+ $logstart = -s $node_standby->logfile;
+
+ # Drop the slots, re-create them, change hot_standby_feedback,
+ # check xmin and catalog_xmin values, make slot active and reset stat.
+ reactive_slots_change_hfs_and_wait_for_xmins('pruning_', 'injection_', 0,
+ 1);
+
+ # Create the injection_points extension.
+ $node_primary->safe_psql('testdb', 'CREATE EXTENSION injection_points;');
+
+ # Wait until the extension has been created on the standby.
+ $node_primary->wait_for_replay_catchup($node_standby);
+
+ # Attach the injection point.
+ $node_standby->safe_psql('testdb',
+ "SELECT injection_points_attach('terminate-process-holding-slot', 'wait');"
+ );
+
+ # Trigger a conflict and insert an XLOG_RUNNING_XACTS before triggering
+ # the vacuum.
+ $node_primary->safe_psql(
+ 'testdb', qq[CREATE TABLE injection_test(x integer);
+ DROP TABLE injection_test;
+ SELECT pg_log_standby_snapshot();]);
+
+ # Now launch the vacuum.
+ wait_until_vacuum_can_remove('',
+ 'CREATE TABLE injection_test2(x integer);', 'pg_class');
+
+ # Wait until the startup process hits the injection point by looking at
+ # pg_stat_activity; the termination LOG message has been emitted and
+ # the process has been killed once we wait at the injection point.
+ $node_standby->wait_for_event('startup',
+ 'terminate-process-holding-slot');
+
+ # Note: $node_primary->wait_for_replay_catchup($node_standby) would be
+ # hanging here due to the injection point, so check the log instead.
+
+ ok( $node_standby->log_contains(
+ "terminating process .* to release replication slot \"injection_activeslot\"",
+ $logstart),
+ "terminating process holding the active slot is logged with injection point"
+ );
+
+ # Extract xid_horizon from the logfile; at least one digit is required.
+ my $log_contents = slurp_file($node_standby->logfile, $logstart);
+ (my $xid_horizon) =
+ $log_contents =~ m/The slot conflicted with xid horizon ([0-9]+)\./
+ or die "could not get xid horizon";
+
+ # Ensure the slot is not active.
+ $node_standby->poll_query_until('testdb',
+ "SELECT active_pid is NULL from pg_catalog.pg_replication_slots where slot_name = 'injection_activeslot'"
+ ) or die "injection_activeslot is still active";
+
+ # Decode changes from the slot to reach
+ # LogicalConfirmReceivedLocation().
+ $node_standby->safe_psql('testdb',
+ qq[SELECT pg_logical_slot_get_changes('injection_activeslot', NULL, NULL);]
+ );
+
+ # Wait until catalog_xmin advances after the xid_horizon. A conflict
+ # reason has to be reported.
+ $node_standby->poll_query_until('testdb',
+ "SELECT (SELECT catalog_xmin::text::int - $xid_horizon from pg_catalog.pg_replication_slots where slot_name = 'injection_activeslot') > 0"
+ ) or die "catalog_xmin did not advance";
+
+ # Get the position to search from in the standby logfile.
+ $logstart = -s $node_standby->logfile;
+
+ # Wakeup the injection point.
+ $node_standby->safe_psql('testdb',
+ "SELECT injection_points_wakeup('terminate-process-holding-slot');");
+
+ # Wait for the standby to catchup.
+ $node_primary->wait_for_replay_catchup($node_standby);
+
+ # Check invalidation in the logfile for the active slot.
+ ok( $node_standby->log_contains(
+ "invalidating obsolete replication slot \"injection_activeslot\"",
+ $logstart),
+ "activeslot slot invalidation is logged with injection point");
+
+ # Verify conflict_reason is 'rows_removed' in pg_replication_slots.
+ check_slots_conflict_reason('injection_', 'rows_removed');
+
+ # Detach from the injection point
+ $node_standby->safe_psql('testdb',
+ "SELECT injection_points_detach('terminate-process-holding-slot');");
+
+ # Turn hot_standby_feedback back on
+ change_hot_standby_feedback_and_wait_for_xmins(1, 1);
+}
+
+##################################################
+# Recovery conflict: Invalidate conflicting slots, including in-use slots
+# Scenario 7: incorrect wal_level on primary.
##################################################
# get the position to search from in the standby logfile
$logstart = -s $node_standby->logfile;
# drop the logical slots
-drop_logical_slots('pruning_');
+if (($ENV{enable_injection_points} // '') ne 'yes')
+{
+ drop_logical_slots('pruning_');
+}
+else
+{
+ drop_logical_slots('injection_');
+}
# create the logical slots
create_logical_slots($node_standby, 'wal_level_');