6 files changed, 244 insertions, 131 deletions
diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c
index 4888b0b36b9..dda3d03b149 100644
--- a/src/backend/access/transam/multixact.c
+++ b/src/backend/access/transam/multixact.c
@@ -42,7 +42,7 @@
  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/backend/access/transam/multixact.c,v 1.30 2009/01/20 18:59:37 heikki Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/multixact.c,v 1.31 2009/06/26 20:29:04 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -1543,7 +1543,7 @@ CheckPointMultiXact(void)
 	 * SimpleLruTruncate would get confused.  It seems best not to risk
 	 * removing any data during recovery anyway, so don't truncate.
 	 */
-	if (!InRecovery)
+	if (!RecoveryInProgress())
 		TruncateMultiXact();
 
 	TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(true);
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 401d805a8f5..5990bae8b84 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.344 2009/06/25 21:36:00 heikki Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.345 2009/06/26 20:29:04 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -124,24 +124,36 @@ TimeLineID	ThisTimeLineID = 0;
 /*
  * Are we doing recovery from XLOG?
  *
- * This is only ever true in the startup process, even if the system is still
- * in recovery. Prior to 8.4, all activity during recovery were carried out
- * by Startup process. This local variable continues to be used in functions
- * that need to act differently when called from a redo function (e.g skip
- * WAL logging). To check whether the system is in recovery regardless of what
+ * This is only ever true in the startup process; it should be read as meaning
+ * "this process is replaying WAL records", rather than "the system is in
+ * recovery mode".  It should be examined primarily by functions that need
+ * to act differently when called from a WAL redo function (e.g., to skip WAL
+ * logging).  To check whether the system is in recovery regardless of which
  * process you're running in, use RecoveryInProgress().
  */
 bool		InRecovery = false;
 
-/* Are we recovering using offline XLOG archives? */
-static bool InArchiveRecovery = false;
-
 /*
  * Local copy of SharedRecoveryInProgress variable. True actually means "not
- * known, need to check the shared state"
+ * known, need to check the shared state".
  */
 static bool LocalRecoveryInProgress = true;
 
+/*
+ * Local state for XLogInsertAllowed():
+ *		1: unconditionally allowed to insert XLOG
+ *		0: unconditionally not allowed to insert XLOG
+ *		-1: must check RecoveryInProgress(); disallow until it is false
+ * Most processes start with -1 and transition to 1 after seeing that recovery
+ * is not in progress.  But we can also force the value for special cases.
+ * The coding in XLogInsertAllowed() depends on the first two of these states
+ * being numerically the same as bool true and false.
+ */
+static int	LocalXLogInsertAllowed = -1;
+
+/* Are we recovering using offline XLOG archives? */
+static bool InArchiveRecovery = false;
+
 /* Was the last xlog file restored from archive, or local? */
 static bool restoredFromArchive = false;
 
@@ -260,7 +272,8 @@ static XLogRecPtr RedoRecPtr;
  * new log file.
  *
  * CheckpointLock: must be held to do a checkpoint or restartpoint (ensures
- * only one checkpointer at a time)
+ * only one checkpointer at a time; currently, with all checkpoints done by
+ * the bgwriter, this is just pro forma).
  *
  *----------
  */
@@ -331,7 +344,7 @@ typedef struct XLogCtlData
 
 	/*
 	 * SharedRecoveryInProgress indicates if we're still in crash or archive
-	 * recovery.  It's checked by RecoveryInProgress().
+	 * recovery.  Protected by info_lck.
 	 */
 	bool		SharedRecoveryInProgress;
 
@@ -421,6 +434,7 @@ static XLogRecPtr ReadRecPtr;	/* start of last record read */
 static XLogRecPtr EndRecPtr;	/* end+1 of last record read */
 static XLogRecord *nextRecord = NULL;
 static TimeLineID lastPageTLI = 0;
+
 static XLogRecPtr minRecoveryPoint;		/* local copy of
 										 * ControlFile->minRecoveryPoint */
 static bool updateMinRecoveryPoint = true;
@@ -428,7 +442,7 @@ static bool updateMinRecoveryPoint = true;
 static bool InRedo = false;
 
 /*
- * Flag set by interrupt handlers for later service in the redo loop.
+ * Flags set by interrupt handlers for later service in the redo loop.
  */
 static volatile sig_atomic_t got_SIGHUP = false;
 static volatile sig_atomic_t shutdown_requested = false;
@@ -537,8 +551,8 @@ XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
 	bool		isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH);
 
 	/* cross-check on whether we should be here or not */
-	if (RecoveryInProgress())
-		elog(FATAL, "cannot make new WAL entries during recovery");
+	if (!XLogInsertAllowed())
+		elog(ERROR, "cannot make new WAL entries during recovery");
 
 	/* info's high bits are reserved for use by me */
 	if (info & XLR_INFO_MASK)
@@ -1780,7 +1794,7 @@ XLogSetAsyncCommitLSN(XLogRecPtr asyncCommitLSN)
  * database is consistent.
  *
  * If 'force' is true, 'lsn' argument is ignored. Otherwise, minRecoveryPoint
- * is is only updated if it's not already greater than or equal to 'lsn'.
+ * is only updated if it's not already greater than or equal to 'lsn'.
  */
 static void
 UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
@@ -1796,7 +1810,8 @@ UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
 
 	/*
 	 * An invalid minRecoveryPoint means that we need to recover all the WAL,
-	 * ie. crash recovery. Don't update the control file in that case.
+	 * i.e., we're doing crash recovery.  We never modify the control file's
+	 * value in that case, so we can short-circuit future checks here too.
 	 */
 	if (minRecoveryPoint.xlogid == 0 && minRecoveryPoint.xrecoff == 0)
 		updateMinRecoveryPoint = false;
@@ -1809,12 +1824,26 @@ UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
 		/*
 		 * To avoid having to update the control file too often, we update it
 		 * all the way to the last record being replayed, even though 'lsn'
-		 * would suffice for correctness.
+		 * would suffice for correctness.  This also allows the 'force' case
+		 * to not need a valid 'lsn' value.
+		 *
+		 * Another important reason for doing it this way is that the passed
+		 * 'lsn' value could be bogus, i.e., past the end of available WAL,
+		 * if the caller got it from a corrupted heap page.  Accepting such
+		 * a value as the min recovery point would prevent us from coming up
+		 * at all.  Instead, we just log a warning and continue with recovery.
+		 * (See also the comments about corrupt LSNs in XLogFlush.)
 		 */
 		SpinLockAcquire(&xlogctl->info_lck);
 		newMinRecoveryPoint = xlogctl->replayEndRecPtr;
 		SpinLockRelease(&xlogctl->info_lck);
 
+		if (!force && XLByteLT(newMinRecoveryPoint, lsn))
+			elog(WARNING,
+				 "xlog min recovery request %X/%X is past current point %X/%X",
+				 lsn.xlogid, lsn.xrecoff,
+				 newMinRecoveryPoint.xlogid, newMinRecoveryPoint.xrecoff);
+
 		/* update control file */
 		if (XLByteLT(ControlFile->minRecoveryPoint, newMinRecoveryPoint))
 		{
@@ -1843,10 +1872,13 @@ XLogFlush(XLogRecPtr record)
 	XLogwrtRqst WriteRqst;
 
 	/*
-	 * During REDO, we don't try to flush the WAL, but update minRecoveryPoint
-	 * instead.
+	 * During REDO, we are reading not writing WAL.  Therefore, rather than
+	 * trying to flush the WAL, we should update minRecoveryPoint instead.
+	 * We test XLogInsertAllowed(), not InRecovery, because we need the
+	 * bgwriter to act this way too, and because when the bgwriter tries
+	 * to write the end-of-recovery checkpoint, it should indeed flush.
 	 */
-	if (RecoveryInProgress())
+	if (!XLogInsertAllowed())
 	{
 		UpdateMinRecoveryPoint(record, false);
 		return;
@@ -1935,21 +1967,20 @@ XLogFlush(XLogRecPtr record)
 	 * system's robustness rather than helping it: we do not want to take down
 	 * the whole system due to corruption on one data page.  In particular, if
 	 * the bad page is encountered again during recovery then we would be
-	 * unable to restart the database at all!  (This scenario has actually
-	 * happened in the field several times with 7.1 releases. Note that we
-	 * cannot get here while RecoveryInProgress(), but if the bad page is
-	 * brought in and marked dirty during recovery then if a checkpoint were
-	 * performed at the end of recovery it will try to flush it.
+	 * unable to restart the database at all!  (This scenario actually
+	 * happened in the field several times with 7.1 releases.)  As of 8.4,
+	 * bad LSNs encountered during recovery are UpdateMinRecoveryPoint's
+	 * problem; the only time we can reach here during recovery is while
+	 * flushing the end-of-recovery checkpoint record, and we don't expect
+	 * that to have a bad LSN.
 	 *
-	 * The current approach is to ERROR under normal conditions, but only
-	 * WARNING during recovery, so that the system can be brought up even if
-	 * there's a corrupt LSN.  Note that for calls from xact.c, the ERROR will
+	 * Note that for calls from xact.c, the ERROR will
 	 * be promoted to PANIC since xact.c calls this routine inside a critical
 	 * section.  However, calls from bufmgr.c are not within critical sections
 	 * and so we will not force a restart for a bad LSN on a data page.
 	 */
 	if (XLByteLT(LogwrtResult.Flush, record))
-		elog(InRecovery ? WARNING : ERROR,
+		elog(ERROR,
 		"xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
 			 record.xlogid, record.xrecoff,
 			 LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
@@ -2751,7 +2782,7 @@ RestoreArchivedFile(char *path, const char *xlogfname,
 
 	/*
 	 * Set in_restore_command to tell the signal handler that we should exit
-	 * right away on SIGTERM. We know that we're in a safe point to do that.
+	 * right away on SIGTERM. We know that we're at a safe point to do that.
 	 * Check if we had already received the signal, so that we don't miss a
 	 * shutdown request received just before this.
 	 */
@@ -2833,7 +2864,7 @@ RestoreArchivedFile(char *path, const char *xlogfname,
 	 * problems such as an unfindable command; treat those as fatal errors
 	 * too.
 	 */
-	if (WTERMSIG(rc) == SIGTERM)
+	if (WIFSIGNALED(rc) && WTERMSIG(rc) == SIGTERM)
 		proc_exit(1);
 
 	signaled = WIFSIGNALED(rc) || WEXITSTATUS(rc) > 125;
@@ -4543,6 +4574,7 @@ XLOGShmemInit(void)
 	 * in additional info.)
 	 */
 	XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
+	XLogCtl->SharedRecoveryInProgress = true;
 	XLogCtl->Insert.currpage = (XLogPageHeader) (XLogCtl->pages);
 	SpinLockInit(&XLogCtl->info_lck);
 
@@ -5164,8 +5196,6 @@ StartupXLOG(void)
 	TransactionId oldestActiveXID;
 	bool		bgwriterLaunched = false;
 
-	XLogCtl->SharedRecoveryInProgress = true;
-
 	/*
 	 * Read control file and check XLOG status looks valid.
 	 *
@@ -5392,7 +5422,7 @@ StartupXLOG(void)
 		/* No need to hold ControlFileLock yet, we aren't up far enough */
 		UpdateControlFile();
 
-		/* update our local copy of minRecoveryPoint */
+		/* initialize our local copy of minRecoveryPoint */
 		minRecoveryPoint = ControlFile->minRecoveryPoint;
 
 		/*
@@ -5450,7 +5480,7 @@ StartupXLOG(void)
 			/* use volatile pointer to prevent code rearrangement */
 			volatile XLogCtlData *xlogctl = XLogCtl;
 
-			/* Update shared replayEndRecPtr */
+			/* initialize shared replayEndRecPtr */
 			SpinLockAcquire(&xlogctl->info_lck);
 			xlogctl->replayEndRecPtr = ReadRecPtr;
 			SpinLockRelease(&xlogctl->info_lck);
@@ -5476,7 +5506,8 @@ StartupXLOG(void)
 			 * recovering after crash.
 			 *
 			 * After this point, we can no longer assume that we're the only
-			 * process in addition to postmaster!
+			 * process in addition to postmaster!  Also, fsync requests are
+			 * subsequently to be handled by the bgwriter, not locally.
 			 */
 			if (InArchiveRecovery && IsUnderPostmaster)
 			{
@@ -5526,11 +5557,11 @@ StartupXLOG(void)
 					proc_exit(1);
 
 				/*
-				 * Have we reached our safe starting point? If so, we can tell
+				 * Have we passed our safe starting point? If so, we can tell
 				 * postmaster that the database is consistent now.
 				 */
 				if (!reachedMinRecoveryPoint &&
-					XLByteLE(minRecoveryPoint, EndRecPtr))
+					XLByteLT(minRecoveryPoint, EndRecPtr))
 				{
 					reachedMinRecoveryPoint = true;
 					if (InArchiveRecovery)
@@ -5616,7 +5647,10 @@ StartupXLOG(void)
 
 	/*
 	 * Complain if we did not roll forward far enough to render the backup
-	 * dump consistent.
+	 * dump consistent.  Note: it is indeed okay to look at the local variable
+	 * minRecoveryPoint here, even though ControlFile->minRecoveryPoint might
+	 * be further ahead --- ControlFile->minRecoveryPoint cannot have been
+	 * advanced beyond the WAL we processed.
 	 */
 	if (InRecovery && XLByteLT(EndOfLog, minRecoveryPoint))
 	{
@@ -5816,14 +5850,27 @@ StartupXLOG(void)
 	}
 
 	/*
-	 * All done. Allow backends to write WAL.
+	 * All done.  Allow backends to write WAL.  (Although the bool flag is
+	 * probably atomic in itself, we use the info_lck here to ensure that
+	 * there are no race conditions concerning visibility of other recent
+	 * updates to shared memory.)
 	 */
-	XLogCtl->SharedRecoveryInProgress = false;
+	{
+		/* use volatile pointer to prevent code rearrangement */
+		volatile XLogCtlData *xlogctl = XLogCtl;
+
+		SpinLockAcquire(&xlogctl->info_lck);
+		xlogctl->SharedRecoveryInProgress = false;
+		SpinLockRelease(&xlogctl->info_lck);
+	}
 }
 
 /*
  * Is the system still in recovery?
  *
+ * Unlike testing InRecovery, this works in any process that's connected to
+ * shared memory.
+ *
  * As a side-effect, we initialize the local TimeLineID and RedoRecPtr
  * variables the first time we see that recovery is finished.
  */
@@ -5831,9 +5878,9 @@ bool
 RecoveryInProgress(void)
 {
 	/*
-	 * We check shared state each time only until we leave recovery mode. We
-	 * can't re-enter recovery, so we rely on the local state variable after
-	 * that.
+	 * We check shared state each time only until we leave recovery mode.
+	 * We can't re-enter recovery, so there's no need to keep checking after
+	 * the shared variable has once been seen false.
 	 */
 	if (!LocalRecoveryInProgress)
 		return false;
@@ -5842,11 +5889,15 @@ RecoveryInProgress(void)
 		/* use volatile pointer to prevent code rearrangement */
 		volatile XLogCtlData *xlogctl = XLogCtl;
 
+		/* spinlock is essential on machines with weak memory ordering! */
+		SpinLockAcquire(&xlogctl->info_lck);
 		LocalRecoveryInProgress = xlogctl->SharedRecoveryInProgress;
+		SpinLockRelease(&xlogctl->info_lck);
 
 		/*
-		 * Initialize TimeLineID and RedoRecPtr the first time we see that
-		 * recovery is finished.
+		 * Initialize TimeLineID and RedoRecPtr when we discover that recovery
+		 * is finished.  (If you change this, see also
+		 * LocalSetXLogInsertAllowed.)
 		 */
 		if (!LocalRecoveryInProgress)
 			InitXLOGAccess();
@@ -5856,6 +5907,51 @@ RecoveryInProgress(void)
 }
 
 /*
+ * Is this process allowed to insert new WAL records?
+ *
+ * Ordinarily this is essentially equivalent to !RecoveryInProgress().
+ * But we also have provisions for forcing the result "true" or "false"
+ * within specific processes regardless of the global state.
+ */
+bool
+XLogInsertAllowed(void)
+{
+	/*
+	 * If value is "unconditionally true" (1) or "unconditionally false" (0),
+	 * just return it; the cast relies on those states matching bool values.
+	 * This provides the normal fast path once recovery is known done.
+	 */
+	if (LocalXLogInsertAllowed >= 0)
+		return (bool) LocalXLogInsertAllowed;
+
+	/*
+	 * Else (state is -1), must check to see if we're still in recovery.
+	 */
+	if (RecoveryInProgress())
+		return false;
+
+	/*
+	 * On exit from recovery, reset to "unconditionally true", since there
+	 * is no need to keep checking.
+	 */
+	LocalXLogInsertAllowed = 1;
+	return true;
+}
+
+/*
+ * Make XLogInsertAllowed() return true in the current process only.
+ */
+static void
+LocalSetXLogInsertAllowed(void)
+{
+	Assert(LocalXLogInsertAllowed == -1);	/* expect the -1 (not forced) state */
+	LocalXLogInsertAllowed = 1;
+
+	/* Initialize as RecoveryInProgress() would do when switching state */
+	InitXLOGAccess();
+}
+
+/*
  * Subroutine to try to fetch and validate a prior checkpoint record.
  *
  * whichChkpt identifies the checkpoint (merely for reporting purposes).
@@ -6126,7 +6222,7 @@ ShutdownXLOG(int code, Datum arg)
 static void
 LogCheckpointStart(int flags, bool restartpoint)
 {
-	char	   *msg;
+	const char *msg;
 
 	/*
 	 * XXX: This is hopelessly untranslatable. We could call gettext_noop for
@@ -6205,7 +6301,7 @@ LogCheckpointEnd(bool restartpoint)
  *	CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
  *		ignoring checkpoint_completion_target parameter.
  *	CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occured
- *		since the last one (implied by CHECKPOINT_IS_SHUTDOWN and
+ *		since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
  *		CHECKPOINT_END_OF_RECOVERY).
  *
  * Note: flags contains other bits, of interest here only for logging purposes.
@@ -6225,44 +6321,19 @@ CreateCheckPoint(int flags)
 	uint32		_logSeg;
 	TransactionId *inCommitXids;
 	int			nInCommit;
-	bool		OldInRecovery = InRecovery;
 
 	/*
 	 * An end-of-recovery checkpoint is really a shutdown checkpoint, just
 	 * issued at a different time.
 	 */
-	if (flags & ((CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY) != 0))
+	if (flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY))
 		shutdown = true;
 	else
 		shutdown = false;
 
-	/*
-	 * A startup checkpoint is created before anyone else is allowed to
-	 * write WAL. To allow us to write the checkpoint record, set
-	 * LocalRecoveryInProgress to false. This lets us write WAL, but others
-	 * are still not allowed to do so.
-	 */
-	if (flags & CHECKPOINT_END_OF_RECOVERY)
-	{
-		Assert(RecoveryInProgress());
-		LocalRecoveryInProgress = false;
-		InitXLOGAccess();
-
-		/*
-		 * Before 8.4, end-of-recovery checkpoints were always performed by
-		 * the startup process, and InRecovery was set true. InRecovery is not
-		 * normally set in bgwriter, but we set it here temporarily to avoid
-		 * confusing old code in the end-of-recovery checkpoint code path that
-		 * rely on it.
-		 */
-		InRecovery = true;
-	}
-	else
-	{
-		/* shouldn't happen */
-		if (RecoveryInProgress())
-			elog(ERROR, "can't create a checkpoint during recovery");
-	}
+	/* sanity check */
+	if (RecoveryInProgress() && (flags & CHECKPOINT_END_OF_RECOVERY) == 0)
+		elog(ERROR, "can't create a checkpoint during recovery");
 
 	/*
 	 * Acquire CheckpointLock to ensure only one checkpoint happens at a time.
@@ -6305,7 +6376,6 @@ CreateCheckPoint(int flags)
 
 	/* Begin filling in the checkpoint WAL record */
 	MemSet(&checkPoint, 0, sizeof(checkPoint));
-	checkPoint.ThisTimeLineID = ThisTimeLineID;
 	checkPoint.time = (pg_time_t) time(NULL);
 
 	/*
@@ -6473,6 +6543,20 @@ CreateCheckPoint(int flags)
 	START_CRIT_SECTION();
 
 	/*
+	 * An end-of-recovery checkpoint is created before anyone is allowed to
+	 * write WAL. To allow us to write the checkpoint record, temporarily
+	 * enable XLogInsertAllowed.
+	 */
+	if (flags & CHECKPOINT_END_OF_RECOVERY)
+		LocalSetXLogInsertAllowed();
+
+	/*
+	 * This needs to be done after LocalSetXLogInsertAllowed(), else
+	 * ThisTimeLineID might still be uninitialized.
+	 */
+	checkPoint.ThisTimeLineID = ThisTimeLineID;
+
+	/*
 	 * Now insert the checkpoint record into XLOG.
 	 */
 	rdata.data = (char *) (&checkPoint);
@@ -6488,6 +6572,21 @@ CreateCheckPoint(int flags)
 	XLogFlush(recptr);
 
 	/*
+	 * We mustn't write any new WAL after a shutdown checkpoint, or it will
+	 * be overwritten at next startup.  No-one should even try; this just
+	 * allows sanity-checking.  In the case of an end-of-recovery checkpoint,
+	 * we want to just temporarily disable writing until the system has exited
+	 * recovery.
+	 */
+	if (shutdown)
+	{
+		if (flags & CHECKPOINT_END_OF_RECOVERY)
+			LocalXLogInsertAllowed = -1;	/* return to "check" state */
+		else
+			LocalXLogInsertAllowed = 0;		/* never again write WAL */
+	}
+
+	/*
 	 * We now have ProcLastRecPtr = start of actual checkpoint record, recptr
 	 * = end of actual checkpoint record.
 	 */
@@ -6560,7 +6659,7 @@ CreateCheckPoint(int flags)
 	 * in subtrans.c).	During recovery, though, we mustn't do this because
 	 * StartupSUBTRANS hasn't been called yet.
 	 */
-	if (!InRecovery)
+	if (!RecoveryInProgress())
 		TruncateSUBTRANS(GetOldestXmin(true, false));
 
 	/* All real work is done, but log before releasing lock. */
@@ -6574,9 +6673,6 @@ CreateCheckPoint(int flags)
 									 CheckpointStats.ckpt_segs_recycled);
 
 	LWLockRelease(CheckpointLock);
-
-	/* Restore old value */
-	InRecovery = OldInRecovery;
 }
 
 /*
@@ -6597,10 +6693,14 @@ CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
 }
 
 /*
- * This is used during WAL recovery to establish a point from which recovery
- * can roll forward without replaying the entire recovery log.	This function
- * is called each time a checkpoint record is read from XLOG. It is stored
- * in shared memory, so that it can be used as a restartpoint later on.
+ * Save a checkpoint for recovery restart if appropriate
+ *
+ * This function is called each time a checkpoint record is read from XLOG.
+ * It must determine whether the checkpoint represents a safe restartpoint or
+ * not.  If so, the checkpoint record is stashed in shared memory so that
+ * CreateRestartPoint can consult it.  (Note that the latter function is
+ * executed by the bgwriter, while this one will be executed by the startup
+ * process.)
  */
 static void
 RecoveryRestartPoint(const CheckPoint *checkPoint)
@@ -6640,12 +6740,14 @@ RecoveryRestartPoint(const CheckPoint *checkPoint)
 }
 
 /*
+ * Establish a restartpoint if possible.
+ *
  * This is similar to CreateCheckPoint, but is used during WAL recovery
  * to establish a point from which recovery can roll forward without
  * replaying the entire recovery log.
  *
  * Returns true if a new restartpoint was established. We can only establish
- * a restartpoint if we have replayed a checkpoint record since last
+ * a restartpoint if we have replayed a safe checkpoint record since last
  * restartpoint.
  */
 bool
@@ -6663,7 +6765,7 @@ CreateRestartPoint(int flags)
 	 */
 	LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
 
-	/* Get the a local copy of the last checkpoint record. */
+	/* Get a local copy of the last safe checkpoint record. */
 	SpinLockAcquire(&xlogctl->info_lck);
 	lastCheckPointRecPtr = xlogctl->lastCheckPointRecPtr;
 	memcpy(&lastCheckPoint, &XLogCtl->lastCheckPoint, sizeof(CheckPoint));
@@ -6723,14 +6825,21 @@ CreateRestartPoint(int flags)
 	CheckPointGuts(lastCheckPoint.redo, flags);
 
 	/*
-	 * Update pg_control, using current time
+	 * Update pg_control, using current time.  Check that it still shows
+	 * IN_ARCHIVE_RECOVERY state and an older checkpoint, else do nothing;
+	 * this is a quick hack to make sure nothing really bad happens if
+	 * somehow we get here after the end-of-recovery checkpoint.
 	 */
 	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
-	ControlFile->prevCheckPoint = ControlFile->checkPoint;
-	ControlFile->checkPoint = lastCheckPointRecPtr;
-	ControlFile->checkPointCopy = lastCheckPoint;
-	ControlFile->time = (pg_time_t) time(NULL);
-	UpdateControlFile();
+	if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY &&
+		XLByteLT(ControlFile->checkPointCopy.redo, lastCheckPoint.redo))
+	{
+		ControlFile->prevCheckPoint = ControlFile->checkPoint;
+		ControlFile->checkPoint = lastCheckPointRecPtr;
+		ControlFile->checkPointCopy = lastCheckPoint;
+		ControlFile->time = (pg_time_t) time(NULL);
+		UpdateControlFile();
+	}
 	LWLockRelease(ControlFileLock);
 
 	/*
@@ -6747,6 +6856,7 @@ CreateRestartPoint(int flags)
 			(errmsg("recovery restart point at %X/%X",
 				  lastCheckPoint.redo.xlogid, lastCheckPoint.redo.xrecoff)));
 
+	/* XXX this is currently BROKEN because we are in the wrong process */
 	if (recoveryLastXTime)
 		ereport((log_checkpoints ? LOG : DEBUG2),
 				(errmsg("last completed transaction was at log time %s",
@@ -6821,7 +6931,7 @@ RequestXLogSwitch(void)
  * XLOG resource manager's routines
  *
  * Definitions of info values are in include/catalog/pg_control.h, though
- * not all records types are related to control file processing.
+ * not all record types are related to control file updates.
  */
 void
 xlog_redo(XLogRecPtr lsn, XLogRecord *record)
diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c
index b5fd31532e6..831ea9478a7 100644
--- a/src/backend/postmaster/bgwriter.c
+++ b/src/backend/postmaster/bgwriter.c
@@ -19,7 +19,8 @@
  * condition.)
  *
  * The bgwriter is started by the postmaster as soon as the startup subprocess
- * finishes.  It remains alive until the postmaster commands it to terminate.
+ * finishes, or as soon as recovery begins if we are doing archive recovery.
+ * It remains alive until the postmaster commands it to terminate.
  * Normal termination is by SIGUSR2, which instructs the bgwriter to execute
  * a shutdown checkpoint and then exit(0).	(All backends must be stopped
  * before SIGUSR2 is issued!)  Emergency termination is by SIGQUIT; like any
@@ -37,7 +38,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.61 2009/06/25 21:36:00 heikki Exp $
+ *	  $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.62 2009/06/26 20:29:04 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -902,11 +903,11 @@ BgWriterShmemInit(void)
  *
  * flags is a bitwise OR of the following:
  *	CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
- *	CHECKPOINT_END_OF_RECOVERY: checkpoint is to finish WAL recovery.
+ *	CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
  *	CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
  *		ignoring checkpoint_completion_target parameter.
  *	CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occured
- *		since the last one (implied by CHECKPOINT_IS_SHUTDOWN and
+ *		since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
  *		CHECKPOINT_END_OF_RECOVERY).
  *	CHECKPOINT_WAIT: wait for completion before returning (otherwise,
  *		just signal bgwriter to do it, and return).
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index c9b0e0ab2ec..3dbf36a6cf1 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -37,7 +37,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/postmaster/postmaster.c,v 1.582 2009/06/11 14:49:01 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/postmaster/postmaster.c,v 1.583 2009/06/26 20:29:04 tgl Exp $
  *
  * NOTES
  *
@@ -227,21 +227,22 @@ static bool RecoveryError = false;		/* T if WAL recovery failed */
  *
  * After doing all the postmaster initialization work, we enter PM_STARTUP
  * state and the startup process is launched. The startup process begins by
- * reading the control file and other preliminary initialization steps. When
- * it's ready to start WAL redo, it signals postmaster, and we switch to
- * PM_RECOVERY phase. The background writer is launched, while the startup
- * process continues applying WAL.
+ * reading the control file and other preliminary initialization steps.
+ * In a normal startup, or after crash recovery, the startup process exits
+ * with exit code 0 and we switch to PM_RUN state.  However, archive recovery
+ * is handled specially since it takes much longer and we would like to support
+ * hot standby during archive recovery.
  *
+ * When the startup process is ready to start archive recovery, it signals the
+ * postmaster, and we switch to PM_RECOVERY state. The background writer is
+ * launched, while the startup process continues applying WAL.
  * After reaching a consistent point in WAL redo, startup process signals
- * us again, and we switch to PM_RECOVERY_CONSISTENT phase. There's currently
+ * us again, and we switch to PM_RECOVERY_CONSISTENT state. There's currently
  * no difference between PM_RECOVERY and PM_RECOVERY_CONSISTENT, but we
  * could start accepting connections to perform read-only queries at this
  * point, if we had the infrastructure to do that.
- *
- * When WAL redo is finished, the startup process exits with exit code 0
- * and we switch to PM_RUN state. Startup process can also skip the
- * recovery and consistent recovery phases altogether, as it will during
- * normal startup when there's no recovery to be done, for example.
+ * When archive recovery is finished, the startup process exits with exit
+ * code 0 and we switch to PM_RUN state.
  *
  * Normal child backends can only be launched when we are in PM_RUN state.
  * (We also allow it in PM_WAIT_BACKUP state, but only for superusers.)
@@ -269,7 +270,7 @@ typedef enum
 {
 	PM_INIT,					/* postmaster starting */
 	PM_STARTUP,					/* waiting for startup subprocess */
-	PM_RECOVERY,				/* in recovery mode */
+	PM_RECOVERY,				/* in archive recovery mode */
 	PM_RECOVERY_CONSISTENT,		/* consistent recovery mode */
 	PM_RUN,						/* normal "database is alive" state */
 	PM_WAIT_BACKUP,				/* waiting for online backup mode to end */
@@ -2195,8 +2196,8 @@ reaper(SIGNAL_ARGS)
 
 			/*
 			 * Unexpected exit of startup process (including FATAL exit)
-			 * during PM_STARTUP is treated as catastrophic. There is no other
-			 * processes running yet, so we can just exit.
+			 * during PM_STARTUP is treated as catastrophic. There are no
+			 * other processes running yet, so we can just exit.
 			 */
 			if (pmState == PM_STARTUP && !EXIT_STATUS_0(exitstatus))
 			{
@@ -2247,7 +2248,7 @@ reaper(SIGNAL_ARGS)
 
 			/*
 			 * Crank up the background writer, if we didn't do that already
-			 * when we entered consistent recovery phase.  It doesn't matter
+			 * when we entered consistent recovery state.  It doesn't matter
 			 * if this fails, we'll just try again later.
 			 */
 			if (BgWriterPID == 0)
@@ -4008,7 +4009,7 @@ sigusr1_handler(SIGNAL_ARGS)
 		/*
 		 * Load the flat authorization file into postmaster's cache. The
 		 * startup process won't have recomputed this from the database yet,
-		 * so we it may change following recovery.
+		 * so it may change following recovery.
 		 */
 		load_role();
 
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 18402a6ad61..0c4861d6dbb 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.147 2009/06/25 21:36:00 heikki Exp $
+ *	  $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.148 2009/06/26 20:29:04 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -204,10 +204,10 @@ mdinit(void)
 }
 
 /*
- * In archive recovery, we rely on bgwriter to do fsyncs(), but we don't
- * know that we do archive recovery at process startup when pendingOpsTable
- * has already been created. Calling this function drops pendingOpsTable
- * and causes any subsequent requests to be forwarded to bgwriter.
+ * In archive recovery, we rely on bgwriter to do fsyncs, but we will have
+ * already created the pendingOpsTable during initialization of the startup
+ * process.  Calling this function drops the local pendingOpsTable so that
+ * subsequent requests will be forwarded to bgwriter.
  */
 void
 SetForwardFsyncRequests(void)
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index ea9e232a08c..052a314d74e 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -6,7 +6,7 @@
  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/xlog.h,v 1.92 2009/06/25 21:36:00 heikki Exp $
+ * $PostgreSQL: pgsql/src/include/access/xlog.h,v 1.93 2009/06/26 20:29:04 tgl Exp $
  */
 #ifndef XLOG_H
 #define XLOG_H
@@ -159,15 +159,15 @@ extern bool XLOG_DEBUG;
 
 /* These directly affect the behavior of CreateCheckPoint and subsidiaries */
 #define CHECKPOINT_IS_SHUTDOWN	0x0001	/* Checkpoint is for shutdown */
-#define CHECKPOINT_IMMEDIATE	0x0002	/* Do it without delays */
-#define CHECKPOINT_FORCE		0x0004	/* Force even if no activity */
+#define CHECKPOINT_END_OF_RECOVERY	0x0002	/* Like shutdown checkpoint, but
+											 * issued at end of WAL recovery */
+#define CHECKPOINT_IMMEDIATE	0x0004	/* Do it without delays */
+#define CHECKPOINT_FORCE		0x0008	/* Force even if no activity */
 /* These are important to RequestCheckpoint */
-#define CHECKPOINT_WAIT			0x0008	/* Wait for completion */
+#define CHECKPOINT_WAIT			0x0010	/* Wait for completion */
 /* These indicate the cause of a checkpoint request */
-#define CHECKPOINT_CAUSE_XLOG	0x0010	/* XLOG consumption */
-#define CHECKPOINT_CAUSE_TIME	0x0020	/* Elapsed time */
-#define CHECKPOINT_END_OF_RECOVERY	0x0040	/* Like shutdown checkpoint, but
-											 * issued at end of WAL recovery */
+#define CHECKPOINT_CAUSE_XLOG	0x0020	/* XLOG consumption */
+#define CHECKPOINT_CAUSE_TIME	0x0040	/* Elapsed time */
 
 /* Checkpoint statistics */
 typedef struct CheckpointStatsData
@@ -202,6 +202,7 @@ extern void xlog_redo(XLogRecPtr lsn, XLogRecord *record);
 extern void xlog_desc(StringInfo buf, uint8 xl_info, char *rec);
 
 extern bool RecoveryInProgress(void);
+extern bool XLogInsertAllowed(void);
 
 extern void UpdateControlFile(void);
 extern Size XLOGShmemSize(void);