author     Andres Freund <andres@anarazel.de>    2022-02-25 16:58:48 -0800
committer  Andres Freund <andres@anarazel.de>    2022-02-25 17:04:39 -0800
commit     fe0972ee5e6f8a663c5cf3f24ef98987c503da95 (patch)
tree       6e631076d93f18709b1a73e7a9eb719bbcff955d
parent     638300fef541fb9393caa1ee8821a639816301d1 (diff)
Add further debug info to help debug 019_replslot_limit.pl failures.
See also afdeff10526. Failures after that commit provided a few more hints, but not yet enough to understand what's going on.

In 019_replslot_limit.pl shut down nodes with fast instead of immediate mode if we observe the failure mode. That should tell us whether the failures we're observing are just a timing issue under high load. PGCTLTIMEOUT should prevent buildfarm animals from hanging endlessly.

Also adds a bit more logging to replication slot drop and ShutdownPostgres().

Discussion: https://postgr.es/m/20220225192941.hqnvefgdzaro6gzg@alap3.anarazel.de
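A note for anyone chasing these failures locally: the added slot messages are emitted at DEBUG3, so they are invisible at the default log level. A minimal sketch of the settings involved; the values are illustrative, not part of this commit:

    # postgresql.conf: raise the log level so the new DEBUG3 messages
    # from temporary slot cleanup and slot drop appear in the server log
    log_min_messages = debug3

    # environment: PGCTLTIMEOUT (seconds) bounds how long pg_ctl waits,
    # so a wedged shutdown fails the test run rather than hanging it
    PGCTLTIMEOUT=180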
-rw-r--r--    src/backend/replication/slot.c               13
-rw-r--r--    src/backend/storage/lmgr/lwlock.c             7
-rw-r--r--    src/backend/utils/init/postinit.c            17
-rw-r--r--    src/include/storage/lwlock.h                  1
-rw-r--r--    src/test/recovery/t/019_replslot_limit.pl    18
5 files changed, 55 insertions(+), 1 deletion(-)
diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c
index 3d39fddaaef..f238a392ae7 100644
--- a/src/backend/replication/slot.c
+++ b/src/backend/replication/slot.c
@@ -569,6 +569,10 @@ restart:
if (!s->in_use)
continue;
+ /* unlocked read of active_pid is ok for debugging purposes */
+ elog(DEBUG3, "temporary replication slot cleanup: %d in use, active_pid: %d",
+ i, s->active_pid);
+
SpinLockAcquire(&s->mutex);
if (s->active_pid == MyProcPid)
{
@@ -629,6 +633,9 @@ ReplicationSlotDropPtr(ReplicationSlot *slot)
char path[MAXPGPATH];
char tmppath[MAXPGPATH];
+ /* temp debugging aid to analyze 019_replslot_limit failures */
+ elog(DEBUG3, "replication slot drop: %s: begin", NameStr(slot->data.name));
+
/*
* If some other backend ran this code concurrently with us, we might try
* to delete a slot with a certain name while someone else was trying to
@@ -679,6 +686,9 @@ ReplicationSlotDropPtr(ReplicationSlot *slot)
path, tmppath)));
}
+ elog(DEBUG3, "replication slot drop: %s: removed on-disk",
+ NameStr(slot->data.name));
+
/*
* The slot is definitely gone. Lock out concurrent scans of the array
* long enough to kill it. It's OK to clear the active PID here without
@@ -734,6 +744,9 @@ ReplicationSlotDropPtr(ReplicationSlot *slot)
* a slot while we're still cleaning up the detritus of the old one.
*/
LWLockRelease(ReplicationSlotAllocationLock);
+
+ elog(DEBUG3, "replication slot drop: %s: done",
+ NameStr(slot->data.name));
}
/*
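Aside on the first hunk above: active_pid is read without taking the slot's spinlock, which is tolerable because the value only feeds a log message; the authoritative check happens under the mutex immediately afterwards. The same pattern, shown standalone as an illustrative sketch (names as in slot.c; the body under the lock is elided):

    /*
     * Illustrative sketch: a possibly-stale, unlocked read is fine for a
     * DEBUG3 message, while any real decision is made only after
     * acquiring the slot's spinlock.
     */
    elog(DEBUG3, "temporary replication slot cleanup: %d in use, active_pid: %d",
         i, s->active_pid);        /* unlocked, may be stale */

    SpinLockAcquire(&s->mutex);
    if (s->active_pid == MyProcPid)
    {
        /* authoritative state change happens here, under the mutex */
    }
    SpinLockRelease(&s->mutex);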
diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
index 7b0dea4abec..8f7f1b2f7c3 100644
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -1945,3 +1945,10 @@ LWLockHeldByMeInMode(LWLock *l, LWLockMode mode)
}
return false;
}
+
+/* temp debugging aid to analyze 019_replslot_limit failures */
+int
+LWLockHeldCount(void)
+{
+ return num_held_lwlocks;
+}
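The new helper simply exposes lwlock.c's num_held_lwlocks counter, which LWLockAcquire()/LWLockRelease() maintain and which is local to the calling backend, so the count says nothing about other processes. A hypothetical caller, purely for illustration (run_critical_section() is made up):

    /*
     * Hypothetical usage sketch, not part of this patch: verify that a
     * code path releases every lwlock it acquires.
     */
    int     held_before = LWLockHeldCount();

    run_critical_section();     /* hypothetical function that takes locks */

    Assert(LWLockHeldCount() == held_before);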
diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c
index a29fa0b3e6a..86d193c89fc 100644
--- a/src/backend/utils/init/postinit.c
+++ b/src/backend/utils/init/postinit.c
@@ -1262,6 +1262,23 @@ ShutdownPostgres(int code, Datum arg)
* them explicitly.
*/
LockReleaseAll(USER_LOCKMETHOD, true);
+
+ /*
+ * temp debugging aid to analyze 019_replslot_limit failures
+ *
+ * If an error were thrown outside of a transaction nothing up to now
+ * would have released lwlocks. We probably will add an
+ * LWLockReleaseAll(). But for now make it easier to understand such cases
+ * by warning if any lwlocks are held.
+ */
+#ifdef USE_ASSERT_CHECKING
+ {
+ int held_lwlocks = LWLockHeldCount();
+ if (held_lwlocks)
+ elog(WARNING, "holding %d lwlocks at the end of ShutdownPostgres()",
+ held_lwlocks);
+ }
+#endif
}
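As the comment in the hunk says, this commit only warns; the LWLockReleaseAll() it anticipates is not added here. A rough sketch of that possible follow-up, purely illustrative, keeping the warning ahead of the release so held locks are still visible when it fires:

    /* Sketch of the direction the comment hints at; not part of this commit. */
    #ifdef USE_ASSERT_CHECKING
        if (LWLockHeldCount() != 0)
            elog(WARNING, "holding %d lwlocks at the end of ShutdownPostgres()",
                 LWLockHeldCount());
    #endif

        LWLockReleaseAll();     /* hypothetical future addition */

Because the check sits under USE_ASSERT_CHECKING, it fires only in assert-enabled builds (such as most buildfarm animals) and leaves production builds unaffected.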
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index 124977cf7e3..c3d5889d7b2 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -121,6 +121,7 @@ extern void LWLockReleaseClearVar(LWLock *lock, uint64 *valptr, uint64 val);
extern void LWLockReleaseAll(void);
extern bool LWLockHeldByMe(LWLock *lock);
extern bool LWLockHeldByMeInMode(LWLock *lock, LWLockMode mode);
+extern int LWLockHeldCount(void);
extern bool LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval);
extern void LWLockUpdateVar(LWLock *lock, uint64 *valptr, uint64 value);
diff --git a/src/test/recovery/t/019_replslot_limit.pl b/src/test/recovery/t/019_replslot_limit.pl
index 0c9da9bf272..9bb71b62c06 100644
--- a/src/test/recovery/t/019_replslot_limit.pl
+++ b/src/test/recovery/t/019_replslot_limit.pl
@@ -335,7 +335,23 @@ $node_standby3->start;
$node_primary3->wait_for_catchup($node_standby3);
my $senderpid = $node_primary3->safe_psql('postgres',
"SELECT pid FROM pg_stat_activity WHERE backend_type = 'walsender'");
-like($senderpid, qr/^[0-9]+$/, "have walsender pid $senderpid");
+
+# We've seen occasional cases where multiple walsender pids are active. An
+# immediate shutdown may hide evidence of a locking bug. So if multiple
+# walsenders are observed, shut down in fast mode, and collect some more
+# information.
+if (not like($senderpid, qr/^[0-9]+$/, "have walsender pid $senderpid"))
+{
+ my ($stdout, $stderr);
+ $node_primary3->psql('postgres',
+ "\\a\\t\nSELECT * FROM pg_stat_activity",
+ stdout => \$stdout, stderr => \$stderr);
+ diag $stdout, $stderr;
+ $node_primary3->stop('fast');
+ $node_standby3->stop('fast');
+ die "could not determine walsender pid, can't continue";
+}
+
my $receiverpid = $node_standby3->safe_psql('postgres',
"SELECT pid FROM pg_stat_activity WHERE backend_type = 'walreceiver'");
like($receiverpid, qr/^[0-9]+$/, "have walreceiver pid $receiverpid");