aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorSimon Riggs <simon@2ndQuadrant.com>2011-09-07 09:09:47 +0100
committerSimon Riggs <simon@2ndQuadrant.com>2011-09-07 09:09:47 +0100
commitdde70cc313683e47e71997759c6029b4220f71c0 (patch)
treeffede6886e20a2b105afe793dc823c16a9af96f3 /src
parent39039e6d7a48d37aedcfca7973cea3288ce356d4 (diff)
downloadpostgresql-dde70cc313683e47e71997759c6029b4220f71c0.tar.gz
postgresql-dde70cc313683e47e71997759c6029b4220f71c0.zip
Emit cascaded standby message on shutdown only when appropriate.
Adds an additional test for active walsenders and closes a race condition that occurs when we fail over while a new walsender is connecting. Reported and fixed by Fujii Masao. Review by Heikki Linnakangas.
Diffstat (limited to 'src')
-rw-r--r--src/backend/postmaster/postmaster.c5
-rw-r--r--src/backend/replication/walsender.c29
2 files changed, 32 insertions, 2 deletions
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index 6e231a538f4..df4a2aa8853 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -2328,10 +2328,11 @@ reaper(SIGNAL_ARGS)
* XXX should avoid the need for disconnection. When we do,
* am_cascading_walsender should be replaced with RecoveryInProgress()
*/
- if (max_wal_senders > 0)
+ if (max_wal_senders > 0 && CountChildren(BACKEND_TYPE_WALSND) > 0)
{
ereport(LOG,
- (errmsg("terminating all walsender processes to force cascaded standby(s) to update timeline and reconnect")));
+ (errmsg("terminating all walsender processes to force cascaded "
+ "standby(s) to update timeline and reconnect")));
SignalSomeChildren(SIGUSR2, BACKEND_TYPE_WALSND);
}
diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c
index 0e8098abf4a..474567a2042 100644
--- a/src/backend/replication/walsender.c
+++ b/src/backend/replication/walsender.c
@@ -369,6 +369,35 @@ StartReplication(StartReplicationCmd *cmd)
SendPostmasterSignal(PMSIGNAL_ADVANCE_STATE_MACHINE);
/*
+ * When promoting a cascading standby, postmaster sends SIGUSR2 to
+ * any cascading walsenders to kill them. But there is a corner-case where
+ * such walsender fails to receive SIGUSR2 and survives a standby promotion
+ * unexpectedly. This happens when postmaster sends SIGUSR2 before
+ * the walsender marks itself as a WAL sender, because postmaster sends
+ * SIGUSR2 to only the processes marked as a WAL sender.
+ *
+ * To avoid this corner-case, if recovery is NOT in progress even though
+ * the walsender is cascading one, we do the same thing as SIGUSR2 signal
+ * handler does, i.e., set walsender_ready_to_stop to true. Which causes
+ * the walsender to end later.
+ *
+ * When terminating cascading walsenders, usually postmaster writes
+ * the log message announcing the terminations. But there is a race condition
+ * here. If there is no walsender except this process before reaching here,
+ * postmaster thinks that there is no walsender and suppresses that
+ * log message. To handle this case, we always emit that log message here.
+ * This might cause duplicate log messages, but which is less likely to happen,
+ * so it's not worth writing some code to suppress them.
+ */
+ if (am_cascading_walsender && !RecoveryInProgress())
+ {
+ ereport(LOG,
+ (errmsg("terminating walsender process to force cascaded standby "
+ "to update timeline and reconnect")));
+ walsender_ready_to_stop = true;
+ }
+
+ /*
* We assume here that we're logging enough information in the WAL for
* log-shipping, since this is checked in PostmasterMain().
*