2 files changed, 138 insertions, 52 deletions
diff --git a/src/backend/postmaster/pgarch.c b/src/backend/postmaster/pgarch.c
index 6cb32fcb601..e181950c0fe 100644
--- a/src/backend/postmaster/pgarch.c
+++ b/src/backend/postmaster/pgarch.c
@@ -19,7 +19,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/postmaster/pgarch.c,v 1.37 2008/01/01 19:45:51 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/postmaster/pgarch.c,v 1.38 2008/01/11 00:54:08 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -77,12 +77,15 @@
  * ----------
  */
 static time_t last_pgarch_start_time;
+static time_t last_sigterm_time = 0;
 
 /*
  * Flags set by interrupt handlers for later service in the main loop.
  */
 static volatile sig_atomic_t got_SIGHUP = false;
+static volatile sig_atomic_t got_SIGTERM = false;
 static volatile sig_atomic_t wakened = false;
+static volatile sig_atomic_t ready_to_stop = false;
 
 /* ----------
  * Local function forward declarations
@@ -95,7 +98,9 @@ static pid_t pgarch_forkexec(void);
 NON_EXEC_STATIC void PgArchiverMain(int argc, char *argv[]);
 static void pgarch_exit(SIGNAL_ARGS);
 static void ArchSigHupHandler(SIGNAL_ARGS);
+static void ArchSigTermHandler(SIGNAL_ARGS);
 static void pgarch_waken(SIGNAL_ARGS);
+static void pgarch_waken_stop(SIGNAL_ARGS);
 static void pgarch_MainLoop(void);
 static void pgarch_ArchiverCopyLoop(void);
 static bool pgarch_archiveXlog(char *xlog);
@@ -236,16 +241,16 @@ PgArchiverMain(int argc, char *argv[])
 
 	/*
 	 * Ignore all signals usually bound to some action in the postmaster,
-	 * except for SIGHUP, SIGUSR1 and SIGQUIT.
+	 * except for SIGHUP, SIGTERM, SIGUSR1, SIGUSR2, and SIGQUIT.
 	 */
 	pqsignal(SIGHUP, ArchSigHupHandler);
 	pqsignal(SIGINT, SIG_IGN);
-	pqsignal(SIGTERM, SIG_IGN);
+	pqsignal(SIGTERM, ArchSigTermHandler);
 	pqsignal(SIGQUIT, pgarch_exit);
 	pqsignal(SIGALRM, SIG_IGN);
 	pqsignal(SIGPIPE, SIG_IGN);
 	pqsignal(SIGUSR1, pgarch_waken);
-	pqsignal(SIGUSR2, SIG_IGN);
+	pqsignal(SIGUSR2, pgarch_waken_stop);
 	pqsignal(SIGCHLD, SIG_DFL);
 	pqsignal(SIGTTIN, SIG_DFL);
 	pqsignal(SIGTTOU, SIG_DFL);
@@ -267,28 +272,47 @@ PgArchiverMain(int argc, char *argv[])
 static void
 pgarch_exit(SIGNAL_ARGS)
 {
-	/*
-	 * For now, we just nail the doors shut and get out of town.  It might
-	 * seem cleaner to finish up any pending archive copies, but there's a
-	 * nontrivial risk that init will kill us partway through.
-	 */
-	exit(0);
+	/* SIGQUIT: exit at once with nonzero status; no further archiving */
+	exit(1);
 }
 
-/* SIGHUP: set flag to re-read config file at next convenient time */
+/* SIGHUP signal handler for archiver process */
 static void
 ArchSigHupHandler(SIGNAL_ARGS)
 {
+	/* set flag to re-read config file at next convenient time */
 	got_SIGHUP = true;
 }
 
+/* SIGTERM signal handler for archiver process */
+static void
+ArchSigTermHandler(SIGNAL_ARGS)
+{
+	/*
+	 * The postmaster never sends us SIGTERM, so we assume that this means
+	 * that init is trying to shut down the whole system.  If we hang around
+	 * too long we'll get SIGKILL'd.  Set flag to prevent starting any more
+	 * archive commands.
+	 */
+	got_SIGTERM = true;
+}
+
 /* SIGUSR1 signal handler for archiver process */
 static void
 pgarch_waken(SIGNAL_ARGS)
 {
+	/* set flag that there is work to be done */
 	wakened = true;
 }
 
+/* SIGUSR2 signal handler for archiver process */
+static void
+pgarch_waken_stop(SIGNAL_ARGS)
+{
+	/* set flag to do a final cycle and shut down afterwards */
+	ready_to_stop = true;
+}
+
 /*
  * pgarch_MainLoop
  *
@@ -298,6 +322,7 @@ static void
 pgarch_MainLoop(void)
 {
 	time_t		last_copy_time = 0;
+	bool		time_to_stop;
 
 	/*
 	 * We run the copy loop immediately upon entry, in case there are
@@ -309,6 +334,9 @@ pgarch_MainLoop(void)
 
 	do
 	{
+		/* When we get SIGUSR2, we do one more archive cycle, then exit */
+		time_to_stop = ready_to_stop;
+
 		/* Check for config update */
 		if (got_SIGHUP)
 		{
@@ -316,8 +344,26 @@ pgarch_MainLoop(void)
 			ProcessConfigFile(PGC_SIGHUP);
 		}
 
+		/*
+		 * If we've gotten SIGTERM, we normally just sit and do nothing until
+		 * SIGUSR2 arrives.  However, that means a random SIGTERM would
+		 * disable archiving indefinitely, which doesn't seem like a good
+		 * idea.  Once 60 seconds have passed since we first noticed SIGTERM,
+		 * exit anyway so the postmaster can start a new archiver if needed.
+		 */
+		if (got_SIGTERM)
+		{
+			time_t		curtime = time(NULL);
+
+			if (last_sigterm_time == 0)
+				last_sigterm_time = curtime;
+			else if ((unsigned int) (curtime - last_sigterm_time) >=
+					 (unsigned int) 60)
+				break;
+		}
+
 		/* Do what we're here for */
-		if (wakened)
+		if (wakened || time_to_stop)
 		{
 			wakened = false;
 			pgarch_ArchiverCopyLoop();
@@ -334,7 +380,8 @@ pgarch_MainLoop(void)
 		 * sleep into 1-second increments, and check for interrupts after each
 		 * nap.
 		 */
-		while (!(wakened || got_SIGHUP))
+		while (!(wakened || ready_to_stop || got_SIGHUP ||
+				 !PostmasterIsAlive(true)))
 		{
 			time_t		curtime;
 
@@ -344,7 +391,13 @@ pgarch_MainLoop(void)
 				(unsigned int) PGARCH_AUTOWAKE_INTERVAL)
 				wakened = true;
 		}
-	} while (PostmasterIsAlive(true));
+
+		/*
+		 * The archiver quits when the postmaster dies (not expected), after
+		 * one more archiving cycle following SIGUSR2, or by the SIGTERM
+		 * timeout "break" above.
+		 */
+	} while (PostmasterIsAlive(true) && !time_to_stop);
 }
 
 /*
@@ -377,8 +430,14 @@ pgarch_ArchiverCopyLoop(void)
 
 		for (;;)
 		{
-			/* Abandon processing if we notice our postmaster has died */
-			if (!PostmasterIsAlive(true))
+			/*
+			 * Do not initiate any more archive commands after receiving
+			 * SIGTERM, nor after the postmaster has died unexpectedly.
+			 * The first condition is to try to keep from having init
+			 * SIGKILL the command, and the second is to avoid conflicts
+			 * with another archiver spawned by a newer postmaster.
+			 */
+			if (got_SIGTERM || !PostmasterIsAlive(true))
 				return;
 
 			if (pgarch_archiveXlog(xlog))
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index ba6c9b9183e..fe1ed795f91 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -37,7 +37,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/postmaster/postmaster.c,v 1.550 2008/01/01 19:45:51 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/postmaster/postmaster.c,v 1.551 2008/01/11 00:54:09 tgl Exp $
  *
  * NOTES
  *
@@ -244,7 +244,7 @@ static bool FatalError = false; /* T if recovering from backend crash */
  * Notice that this state variable does not distinguish *why* we entered
  * PM_WAIT_BACKENDS or later states --- Shutdown and FatalError must be
  * consulted to find that out.	FatalError is never true in PM_RUN state, nor
- * in PM_SHUTDOWN state (because we don't enter that state when trying to
+ * in PM_SHUTDOWN states (because we don't enter those states when trying to
  * recover from a crash).  It can be true in PM_STARTUP state, because we
  * don't clear it until we've successfully recovered.
  */
@@ -255,6 +255,7 @@ typedef enum
 	PM_RUN,						/* normal "database is alive" state */
 	PM_WAIT_BACKENDS,			/* waiting for live backends to exit */
 	PM_SHUTDOWN,				/* waiting for bgwriter to do shutdown ckpt */
+	PM_SHUTDOWN_2,				/* waiting for archiver to finish */
 	PM_WAIT_DEAD_END,			/* waiting for dead_end children to exit */
 	PM_NO_CHILDREN				/* all important children have exited */
 } PMState;
@@ -1312,12 +1313,8 @@ ServerLoop(void)
 				start_autovac_launcher = false; /* signal processed */
 		}
 
-		/*
-		 * If we have lost the archiver, try to start a new one. We do this
-		 * even if we are shutting down, to allow archiver to take care of any
-		 * remaining WAL files.
-		 */
-		if (XLogArchivingActive() && PgArchPID == 0 && pmState >= PM_RUN)
+		/* If we have lost the archiver, try to start a new one */
+		if (XLogArchivingActive() && PgArchPID == 0 && pmState == PM_RUN)
 			PgArchPID = pgarch_start();
 
 		/* If we have lost the stats collector, try to start a new one */
@@ -2175,12 +2172,31 @@ reaper(SIGNAL_ARGS)
 				 * checkpoint.	(If for some reason it didn't, recovery will
 				 * occur on next postmaster start.)
 				 *
-				 * At this point we should have no normal children left (else
-				 * we'd not be in PM_SHUTDOWN state) but we might have
-				 * dead_end children.
+				 * At this point we should have no normal backend children
+				 * left (else we'd not be in PM_SHUTDOWN state) but we might
+				 * have dead_end children to wait for.
+				 *
+				 * If we have an archiver subprocess, tell it to do a last
+				 * archive cycle and quit; otherwise we can go directly to
+				 * PM_WAIT_DEAD_END state.
 				 */
 				Assert(Shutdown > NoShutdown);
-				pmState = PM_WAIT_DEAD_END;
+
+				if (PgArchPID != 0)
+				{
+					/* Waken archiver for the last time */
+					signal_child(PgArchPID, SIGUSR2);
+					pmState = PM_SHUTDOWN_2;
+				}
+				else
+					pmState = PM_WAIT_DEAD_END;
+
+				/*
+				 * We can also shut down the stats collector now; there's
+				 * nothing left for it to do.
+				 */
+				if (PgStatPID != 0)
+					signal_child(PgStatPID, SIGQUIT);
 			}
 			else
 			{
@@ -2227,7 +2243,8 @@ reaper(SIGNAL_ARGS)
 		/*
 		 * Was it the archiver?  If so, just try to start a new one; no need
 		 * to force reset of the rest of the system.  (If fail, we'll try
-		 * again in future cycles of the main loop.)
+		 * again in future cycles of the main loop.)  But if we were waiting
+		 * for it to shut down, advance to the next shutdown step.
 		 */
 		if (pid == PgArchPID)
 		{
@@ -2235,8 +2252,10 @@ reaper(SIGNAL_ARGS)
 			if (!EXIT_STATUS_0(exitstatus))
 				LogChildExit(LOG, _("archiver process"),
 							 pid, exitstatus);
-			if (XLogArchivingActive() && pmState >= PM_RUN)
+			if (XLogArchivingActive() && pmState == PM_RUN)
 				PgArchPID = pgarch_start();
+			else if (pmState == PM_SHUTDOWN_2)
+				pmState = PM_WAIT_DEAD_END;
 			continue;
 		}
 
@@ -2563,6 +2582,11 @@ PostmasterStateMachine(void)
 				 * change causes ServerLoop to stop creating new ones.
 				 */
 				pmState = PM_WAIT_DEAD_END;
+
+				/*
+				 * We already SIGQUIT'd the archiver and stats processes,
+				 * if any, when we entered FatalError state.
+				 */
 			}
 			else
 			{
@@ -2591,13 +2615,13 @@ PostmasterStateMachine(void)
 					 */
 					FatalError = true;
 					pmState = PM_WAIT_DEAD_END;
+
+					/* Kill the archiver and stats collector too */
+					if (PgArchPID != 0)
+						signal_child(PgArchPID, SIGQUIT);
+					if (PgStatPID != 0)
+						signal_child(PgStatPID, SIGQUIT);
 				}
-				/* Tell pgarch to shut down too; nothing left for it to do */
-				if (PgArchPID != 0)
-					signal_child(PgArchPID, SIGQUIT);
-				/* Tell pgstat to shut down too; nothing left for it to do */
-				if (PgStatPID != 0)
-					signal_child(PgStatPID, SIGQUIT);
 			}
 		}
 	}
@@ -2606,16 +2630,26 @@ PostmasterStateMachine(void)
 	{
 		/*
 		 * PM_WAIT_DEAD_END state ends when the BackendList is entirely empty
-		 * (ie, no dead_end children remain).
+		 * (ie, no dead_end children remain), and the archiver and stats
+		 * collector are gone too.
+		 *
+		 * The reason we wait for those two is to protect them against a new
+		 * postmaster starting conflicting subprocesses; this isn't an
+		 * ironclad protection, but it at least helps in the
+		 * shutdown-and-immediately-restart scenario.  Note that they have
+		 * already been sent appropriate shutdown signals, either during a
+		 * normal state transition leading up to PM_WAIT_DEAD_END, or during
+		 * FatalError processing.
 		 */
-		if (!DLGetHead(BackendList))
+		if (DLGetHead(BackendList) == NULL &&
+			PgArchPID == 0 && PgStatPID == 0)
 		{
 			/* These other guys should be dead already */
 			Assert(StartupPID == 0);
 			Assert(BgWriterPID == 0);
 			Assert(WalWriterPID == 0);
 			Assert(AutoVacPID == 0);
-			/* archiver, stats, and syslogger are not considered here */
+			/* syslogger is not considered here */
 			pmState = PM_NO_CHILDREN;
 		}
 	}
@@ -2628,14 +2662,9 @@ PostmasterStateMachine(void)
 	 * we got SIGTERM from init --- there may well not be time for recovery
 	 * before init decides to SIGKILL us.)
 	 *
-	 * Note: we do not wait around for exit of the archiver or stats
-	 * processes.  They've been sent SIGQUIT by this point (either when we
-	 * entered PM_SHUTDOWN state, or when we set FatalError, and at least one
-	 * of those must have happened by now).  In any case they contain logic to
-	 * commit hara-kiri if they notice the postmaster is gone.	Since they
-	 * aren't connected to shared memory, they pose no problem for shutdown.
-	 * The syslogger is not considered either, since it's intended to survive
-	 * till the postmaster exits.
+	 * Note that the syslogger continues to run.  It will exit when it sees
+	 * EOF on its input pipe, which happens when there are no more upstream
+	 * processes.
 	 */
 	if (Shutdown > NoShutdown && pmState == PM_NO_CHILDREN)
 	{
@@ -2652,10 +2681,8 @@ PostmasterStateMachine(void)
 	}
 
 	/*
-	 * If we need to recover from a crash, wait for all shmem-connected
-	 * children to exit, then reset shmem and StartupDataBase.	(We can ignore
-	 * the archiver and stats processes here since they are not connected to
-	 * shmem.)
+	 * If we need to recover from a crash, wait for all non-syslogger
+	 * children to exit, then reset shmem and StartupDataBase.
 	 */
 	if (FatalError && pmState == PM_NO_CHILDREN)
 	{
@@ -3782,7 +3809,7 @@ sigusr1_handler(SIGNAL_ARGS)
 	}
 
 	if (CheckPostmasterSignal(PMSIGNAL_WAKEN_ARCHIVER) &&
-		PgArchPID != 0 && Shutdown <= SmartShutdown)
+		PgArchPID != 0)
 	{
 		/*
 		 * Send SIGUSR1 to archiver process, to wake it up and begin archiving