author     Andres Freund <andres@anarazel.de>  2022-03-24 17:12:09 -0700
committer  Andres Freund <andres@anarazel.de>  2022-03-24 17:12:09 -0700
commit     f28bf667f602f6ff36c219eb40c5f61de4440ae5 (patch)
tree       7a3ceb194e103af732361abbd7ac362d40fd77b6 /src
parent     26ebb0e28032283f99bf985fb47ea3d19fbaf91a (diff)
Add retries for further investigation of 019_replslot_limit.pl failures.
Tom noticed evidence in the buildfarm suggesting that the failures might just be really slow process exits. To investigate further, instead of giving up after seeing multiple walsender pids once, retry. For now, continue to report a test failure even if a retry succeeds.

See also commits afdeff10526 and fe0972ee5e6.

Per suggestion from Tom Lane.

Discussion: https://postgr.es/m/3042597.1648148740@sss.pgh.pa.us
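The shape of the new loop is easier to see in isolation. The following is a minimal, hypothetical sketch of the same retry-with-diagnostics pattern against a small throwaway primary/standby pair; the node names, the backup name 'bkp', and the standalone-script framing are illustrative assumptions, while the pg_stat_activity query, the 150 x 100ms retry cap, and the like()/diag() usage follow the patch below.

    use strict;
    use warnings;
    use Test::More;
    use Time::HiRes qw(usleep);
    use PostgreSQL::Test::Cluster;

    # Hypothetical primary/standby pair so that a walsender actually exists;
    # in the real test these are $node_primary3 and $node_standby3.
    my $primary = PostgreSQL::Test::Cluster->new('primary');
    $primary->init(allows_streaming => 1);
    $primary->start;
    $primary->backup('bkp');

    my $standby = PostgreSQL::Test::Cluster->new('standby');
    $standby->init_from_backup($primary, 'bkp', has_streaming => 1);
    $standby->start;
    $primary->wait_for_catchup($standby);

    my $senderpid;
    my $i = 0;
    while (1)
    {
        $senderpid = $primary->safe_psql('postgres',
            "SELECT pid FROM pg_stat_activity WHERE backend_type = 'walsender'");

        # like() records a failure on every miss, so a retry that eventually
        # succeeds still leaves a failed test behind.
        last if like($senderpid, qr/^[0-9]+$/, "have walsender pid $senderpid");

        # On a miss, dump all active connections for the logs.
        my ($stdout, $stderr);
        $primary->psql('postgres', "\\a\\t\nSELECT * FROM pg_stat_activity",
            stdout => \$stdout, stderr => \$stderr);
        diag $stdout, $stderr;

        # 150 iterations with 100ms of sleep each is roughly 15 seconds;
        # give up entirely after that.
        die "could not determine walsender pid, can't continue"
          if $i++ == 150;

        usleep(100_000);
    }

    done_testing();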
Diffstat (limited to 'src')
-rw-r--r--  src/test/recovery/t/019_replslot_limit.pl | 40
1 file changed, 29 insertions, 11 deletions
diff --git a/src/test/recovery/t/019_replslot_limit.pl b/src/test/recovery/t/019_replslot_limit.pl
index f62b7b32f66..77bb401bc5f 100644
--- a/src/test/recovery/t/019_replslot_limit.pl
+++ b/src/test/recovery/t/019_replslot_limit.pl
@@ -333,23 +333,41 @@ $node_standby3->init_from_backup($node_primary3, $backup_name,
$node_standby3->append_conf('postgresql.conf', "primary_slot_name = 'rep3'");
$node_standby3->start;
$node_primary3->wait_for_catchup($node_standby3);
-my $senderpid = $node_primary3->safe_psql('postgres',
- "SELECT pid FROM pg_stat_activity WHERE backend_type = 'walsender'");
-
-# We've seen occasional cases where multiple walsender pids are active. An
-# immediate shutdown may hide evidence of a locking bug. So if multiple
-# walsenders are observed, shut down in fast mode, and collect some more
-# information.
-if (not like($senderpid, qr/^[0-9]+$/, "have walsender pid $senderpid"))
+
+my $senderpid;
+
+# We've seen occasional cases where multiple walsender pids are active. It
+# could be that we're just observing process shutdown being slow. To collect
+# more information, retry a couple of times, printing a bit of debugging
+# information each iteration. For now, report a test failure even if later
+# iterations succeed.
+my $i = 0;
+while (1)
{
my ($stdout, $stderr);
+
+ $senderpid = $node_primary3->safe_psql('postgres',
+ "SELECT pid FROM pg_stat_activity WHERE backend_type = 'walsender'");
+
+ last if like($senderpid, qr/^[0-9]+$/, "have walsender pid $senderpid");
+
+ # show information about all active connections
$node_primary3->psql('postgres',
"\\a\\t\nSELECT * FROM pg_stat_activity",
stdout => \$stdout, stderr => \$stderr);
diag $stdout, $stderr;
- $node_primary3->stop('fast');
- $node_standby3->stop('fast');
- die "could not determine walsender pid, can't continue";
+
+	# unlikely that the problem would resolve after 15s, so give up at that point
+ if ($i++ == 150)
+ {
+ # An immediate shutdown may hide evidence of a locking bug. If
+ # retrying didn't resolve the issue, shut down in fast mode.
+ $node_primary3->stop('fast');
+ $node_standby3->stop('fast');
+ die "could not determine walsender pid, can't continue";
+ }
+
+ usleep(100_000);
}
my $receiverpid = $node_standby3->safe_psql('postgres',