aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- src/backend/access/transam/multixact.c | 4
-rw-r--r-- src/backend/access/transam/xlog.c | 302
-rw-r--r-- src/backend/postmaster/bgwriter.c | 9
-rw-r--r-- src/backend/postmaster/postmaster.c | 33
-rw-r--r-- src/backend/storage/smgr/md.c | 10
-rw-r--r-- src/include/access/xlog.h | 17
6 files changed, 244 insertions, 131 deletions
diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c
index 4888b0b36b9..dda3d03b149 100644
--- a/src/backend/access/transam/multixact.c
+++ b/src/backend/access/transam/multixact.c
@@ -42,7 +42,7 @@
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/backend/access/transam/multixact.c,v 1.30 2009/01/20 18:59:37 heikki Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/multixact.c,v 1.31 2009/06/26 20:29:04 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -1543,7 +1543,7 @@ CheckPointMultiXact(void)
* SimpleLruTruncate would get confused. It seems best not to risk
* removing any data during recovery anyway, so don't truncate.
*/
- if (!InRecovery)
+ if (!RecoveryInProgress())
TruncateMultiXact();
TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(true);
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 401d805a8f5..5990bae8b84 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.344 2009/06/25 21:36:00 heikki Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.345 2009/06/26 20:29:04 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -124,24 +124,36 @@ TimeLineID ThisTimeLineID = 0;
/*
* Are we doing recovery from XLOG?
*
- * This is only ever true in the startup process, even if the system is still
- * in recovery. Prior to 8.4, all activity during recovery were carried out
- * by Startup process. This local variable continues to be used in functions
- * that need to act differently when called from a redo function (e.g skip
- * WAL logging). To check whether the system is in recovery regardless of what
+ * This is only ever true in the startup process; it should be read as meaning
+ * "this process is replaying WAL records", rather than "the system is in
+ * recovery mode". It should be examined primarily by functions that need
+ * to act differently when called from a WAL redo function (e.g., to skip WAL
+ * logging). To check whether the system is in recovery regardless of which
* process you're running in, use RecoveryInProgress().
*/
bool InRecovery = false;
-/* Are we recovering using offline XLOG archives? */
-static bool InArchiveRecovery = false;
-
/*
* Local copy of SharedRecoveryInProgress variable. True actually means "not
- * known, need to check the shared state"
+ * known, need to check the shared state".
*/
static bool LocalRecoveryInProgress = true;
+/*
+ * Local state for XLogInsertAllowed():
+ * 1: unconditionally allowed to insert XLOG
+ * 0: unconditionally not allowed to insert XLOG
+ * -1: must check RecoveryInProgress(); disallow until it is false
+ * Most processes start with -1 and transition to 1 after seeing that recovery
+ * is not in progress. But we can also force the value for special cases.
+ * The coding in XLogInsertAllowed() depends on the first two of these states
+ * being numerically the same as bool true and false.
+ */
+static int LocalXLogInsertAllowed = -1;
+
+/* Are we recovering using offline XLOG archives? */
+static bool InArchiveRecovery = false;
+
/* Was the last xlog file restored from archive, or local? */
static bool restoredFromArchive = false;
@@ -260,7 +272,8 @@ static XLogRecPtr RedoRecPtr;
* new log file.
*
* CheckpointLock: must be held to do a checkpoint or restartpoint (ensures
- * only one checkpointer at a time)
+ * only one checkpointer at a time; currently, with all checkpoints done by
+ * the bgwriter, this is just pro forma).
*
*----------
*/
@@ -331,7 +344,7 @@ typedef struct XLogCtlData
/*
* SharedRecoveryInProgress indicates if we're still in crash or archive
- * recovery. It's checked by RecoveryInProgress().
+ * recovery. Protected by info_lck.
*/
bool SharedRecoveryInProgress;
@@ -421,6 +434,7 @@ static XLogRecPtr ReadRecPtr; /* start of last record read */
static XLogRecPtr EndRecPtr; /* end+1 of last record read */
static XLogRecord *nextRecord = NULL;
static TimeLineID lastPageTLI = 0;
+
static XLogRecPtr minRecoveryPoint; /* local copy of
* ControlFile->minRecoveryPoint */
static bool updateMinRecoveryPoint = true;
@@ -428,7 +442,7 @@ static bool updateMinRecoveryPoint = true;
static bool InRedo = false;
/*
- * Flag set by interrupt handlers for later service in the redo loop.
+ * Flags set by interrupt handlers for later service in the redo loop.
*/
static volatile sig_atomic_t got_SIGHUP = false;
static volatile sig_atomic_t shutdown_requested = false;
@@ -537,8 +551,8 @@ XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
bool isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH);
/* cross-check on whether we should be here or not */
- if (RecoveryInProgress())
- elog(FATAL, "cannot make new WAL entries during recovery");
+ if (!XLogInsertAllowed())
+ elog(ERROR, "cannot make new WAL entries during recovery");
/* info's high bits are reserved for use by me */
if (info & XLR_INFO_MASK)
@@ -1780,7 +1794,7 @@ XLogSetAsyncCommitLSN(XLogRecPtr asyncCommitLSN)
* database is consistent.
*
* If 'force' is true, 'lsn' argument is ignored. Otherwise, minRecoveryPoint
- * is is only updated if it's not already greater than or equal to 'lsn'.
+ * is only updated if it's not already greater than or equal to 'lsn'.
*/
static void
UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
@@ -1796,7 +1810,8 @@ UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
/*
* An invalid minRecoveryPoint means that we need to recover all the WAL,
- * ie. crash recovery. Don't update the control file in that case.
+ * i.e., we're doing crash recovery. We never modify the control file's
+ * value in that case, so we can short-circuit future checks here too.
*/
if (minRecoveryPoint.xlogid == 0 && minRecoveryPoint.xrecoff == 0)
updateMinRecoveryPoint = false;
@@ -1809,12 +1824,26 @@ UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
/*
* To avoid having to update the control file too often, we update it
* all the way to the last record being replayed, even though 'lsn'
- * would suffice for correctness.
+ * would suffice for correctness. This also allows the 'force' case
+ * to not need a valid 'lsn' value.
+ *
+ * Another important reason for doing it this way is that the passed
+ * 'lsn' value could be bogus, i.e., past the end of available WAL,
+ * if the caller got it from a corrupted heap page. Accepting such
+ * a value as the min recovery point would prevent us from coming up
+ * at all. Instead, we just log a warning and continue with recovery.
+ * (See also the comments about corrupt LSNs in XLogFlush.)
*/
SpinLockAcquire(&xlogctl->info_lck);
newMinRecoveryPoint = xlogctl->replayEndRecPtr;
SpinLockRelease(&xlogctl->info_lck);
+ if (!force && XLByteLT(newMinRecoveryPoint, lsn))
+ elog(WARNING,
+ "xlog min recovery request %X/%X is past current point %X/%X",
+ lsn.xlogid, lsn.xrecoff,
+ newMinRecoveryPoint.xlogid, newMinRecoveryPoint.xrecoff);
+
/* update control file */
if (XLByteLT(ControlFile->minRecoveryPoint, newMinRecoveryPoint))
{
@@ -1843,10 +1872,13 @@ XLogFlush(XLogRecPtr record)
XLogwrtRqst WriteRqst;
/*
- * During REDO, we don't try to flush the WAL, but update minRecoveryPoint
- * instead.
+ * During REDO, we are reading, not writing, WAL. Therefore, rather than
+ * trying to flush the WAL, we should update minRecoveryPoint instead.
+ * We test XLogInsertAllowed(), not InRecovery, because we need the
+ * bgwriter to act this way too, and because when the bgwriter tries
+ * to write the end-of-recovery checkpoint, it should indeed flush.
*/
- if (RecoveryInProgress())
+ if (!XLogInsertAllowed())
{
UpdateMinRecoveryPoint(record, false);
return;
@@ -1935,21 +1967,20 @@ XLogFlush(XLogRecPtr record)
* system's robustness rather than helping it: we do not want to take down
* the whole system due to corruption on one data page. In particular, if
* the bad page is encountered again during recovery then we would be
- * unable to restart the database at all! (This scenario has actually
- * happened in the field several times with 7.1 releases. Note that we
- * cannot get here while RecoveryInProgress(), but if the bad page is
- * brought in and marked dirty during recovery then if a checkpoint were
- * performed at the end of recovery it will try to flush it.
+ * unable to restart the database at all! (This scenario actually
+ * happened in the field several times with 7.1 releases.) As of 8.4,
+ * bad LSNs encountered during recovery are UpdateMinRecoveryPoint's
+ * problem; the only time we can reach here during recovery is while
+ * flushing the end-of-recovery checkpoint record, and we don't expect
+ * that to have a bad LSN.
*
- * The current approach is to ERROR under normal conditions, but only
- * WARNING during recovery, so that the system can be brought up even if
- * there's a corrupt LSN. Note that for calls from xact.c, the ERROR will
+ * Note that for calls from xact.c, the ERROR will
* be promoted to PANIC since xact.c calls this routine inside a critical
* section. However, calls from bufmgr.c are not within critical sections
* and so we will not force a restart for a bad LSN on a data page.
*/
if (XLByteLT(LogwrtResult.Flush, record))
- elog(InRecovery ? WARNING : ERROR,
+ elog(ERROR,
"xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
record.xlogid, record.xrecoff,
LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
@@ -2751,7 +2782,7 @@ RestoreArchivedFile(char *path, const char *xlogfname,
/*
* Set in_restore_command to tell the signal handler that we should exit
- * right away on SIGTERM. We know that we're in a safe point to do that.
+ * right away on SIGTERM. We know that we're at a safe point to do that.
* Check if we had already received the signal, so that we don't miss a
* shutdown request received just before this.
*/
@@ -2833,7 +2864,7 @@ RestoreArchivedFile(char *path, const char *xlogfname,
* problems such as an unfindable command; treat those as fatal errors
* too.
*/
- if (WTERMSIG(rc) == SIGTERM)
+ if (WIFSIGNALED(rc) && WTERMSIG(rc) == SIGTERM)
proc_exit(1);
signaled = WIFSIGNALED(rc) || WEXITSTATUS(rc) > 125;
@@ -4543,6 +4574,7 @@ XLOGShmemInit(void)
* in additional info.)
*/
XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
+ XLogCtl->SharedRecoveryInProgress = true;
XLogCtl->Insert.currpage = (XLogPageHeader) (XLogCtl->pages);
SpinLockInit(&XLogCtl->info_lck);
@@ -5164,8 +5196,6 @@ StartupXLOG(void)
TransactionId oldestActiveXID;
bool bgwriterLaunched = false;
- XLogCtl->SharedRecoveryInProgress = true;
-
/*
* Read control file and check XLOG status looks valid.
*
@@ -5392,7 +5422,7 @@ StartupXLOG(void)
/* No need to hold ControlFileLock yet, we aren't up far enough */
UpdateControlFile();
- /* update our local copy of minRecoveryPoint */
+ /* initialize our local copy of minRecoveryPoint */
minRecoveryPoint = ControlFile->minRecoveryPoint;
/*
@@ -5450,7 +5480,7 @@ StartupXLOG(void)
/* use volatile pointer to prevent code rearrangement */
volatile XLogCtlData *xlogctl = XLogCtl;
- /* Update shared replayEndRecPtr */
+ /* initialize shared replayEndRecPtr */
SpinLockAcquire(&xlogctl->info_lck);
xlogctl->replayEndRecPtr = ReadRecPtr;
SpinLockRelease(&xlogctl->info_lck);
@@ -5476,7 +5506,8 @@ StartupXLOG(void)
* recovering after crash.
*
* After this point, we can no longer assume that we're the only
- * process in addition to postmaster!
+ * process in addition to postmaster! Also, fsync requests are
+ * subsequently to be handled by the bgwriter, not locally.
*/
if (InArchiveRecovery && IsUnderPostmaster)
{
@@ -5526,11 +5557,11 @@ StartupXLOG(void)
proc_exit(1);
/*
- * Have we reached our safe starting point? If so, we can tell
+ * Have we passed our safe starting point? If so, we can tell
* postmaster that the database is consistent now.
*/
if (!reachedMinRecoveryPoint &&
- XLByteLE(minRecoveryPoint, EndRecPtr))
+ XLByteLT(minRecoveryPoint, EndRecPtr))
{
reachedMinRecoveryPoint = true;
if (InArchiveRecovery)
@@ -5616,7 +5647,10 @@ StartupXLOG(void)
/*
* Complain if we did not roll forward far enough to render the backup
- * dump consistent.
+ * dump consistent. Note: it is indeed okay to look at the local variable
+ * minRecoveryPoint here, even though ControlFile->minRecoveryPoint might
+ * be further ahead --- ControlFile->minRecoveryPoint cannot have been
+ * advanced beyond the WAL we processed.
*/
if (InRecovery && XLByteLT(EndOfLog, minRecoveryPoint))
{
@@ -5816,14 +5850,27 @@ StartupXLOG(void)
}
/*
- * All done. Allow backends to write WAL.
+ * All done. Allow backends to write WAL. (Although the bool flag is
+ * probably atomic in itself, we use the info_lck here to ensure that
+ * there are no race conditions concerning visibility of other recent
+ * updates to shared memory.)
*/
- XLogCtl->SharedRecoveryInProgress = false;
+ {
+ /* use volatile pointer to prevent code rearrangement */
+ volatile XLogCtlData *xlogctl = XLogCtl;
+
+ SpinLockAcquire(&xlogctl->info_lck);
+ xlogctl->SharedRecoveryInProgress = false;
+ SpinLockRelease(&xlogctl->info_lck);
+ }
}
/*
* Is the system still in recovery?
*
+ * Unlike testing InRecovery, this works in any process that's connected to
+ * shared memory.
+ *
* As a side-effect, we initialize the local TimeLineID and RedoRecPtr
* variables the first time we see that recovery is finished.
*/
@@ -5831,9 +5878,9 @@ bool
RecoveryInProgress(void)
{
/*
- * We check shared state each time only until we leave recovery mode. We
- * can't re-enter recovery, so we rely on the local state variable after
- * that.
+ * We check shared state each time only until we leave recovery mode.
+ * We can't re-enter recovery, so there's no need to keep checking after
+ * the shared variable has once been seen false.
*/
if (!LocalRecoveryInProgress)
return false;
@@ -5842,11 +5889,15 @@ RecoveryInProgress(void)
/* use volatile pointer to prevent code rearrangement */
volatile XLogCtlData *xlogctl = XLogCtl;
+ /* spinlock is essential on machines with weak memory ordering! */
+ SpinLockAcquire(&xlogctl->info_lck);
LocalRecoveryInProgress = xlogctl->SharedRecoveryInProgress;
+ SpinLockRelease(&xlogctl->info_lck);
/*
- * Initialize TimeLineID and RedoRecPtr the first time we see that
- * recovery is finished.
+ * Initialize TimeLineID and RedoRecPtr when we discover that recovery
+ * is finished. (If you change this, see also
+ * LocalSetXLogInsertAllowed.)
*/
if (!LocalRecoveryInProgress)
InitXLOGAccess();
@@ -5856,6 +5907,51 @@ RecoveryInProgress(void)
}
/*
+ * Is this process allowed to insert new WAL records?
+ *
+ * Ordinarily this is essentially equivalent to !RecoveryInProgress().
+ * But we also have provisions for forcing the result "true" or "false"
+ * within specific processes regardless of the global state.
+ */
+bool
+XLogInsertAllowed(void)
+{
+ /*
+ * If value is "unconditionally true" or "unconditionally false",
+ * just return it. This provides the normal fast path once recovery
+ * is known done.
+ */
+ if (LocalXLogInsertAllowed >= 0)
+ return (bool) LocalXLogInsertAllowed;
+
+ /*
+ * Else, must check to see if we're still in recovery.
+ */
+ if (RecoveryInProgress())
+ return false;
+
+ /*
+ * On exit from recovery, reset to "unconditionally true", since there
+ * is no need to keep checking.
+ */
+ LocalXLogInsertAllowed = 1;
+ return true;
+}
+
+/*
+ * Make XLogInsertAllowed() return true in the current process only.
+ */
+static void
+LocalSetXLogInsertAllowed(void)
+{
+ Assert(LocalXLogInsertAllowed == -1);
+ LocalXLogInsertAllowed = 1;
+
+ /* Initialize as RecoveryInProgress() would do when switching state */
+ InitXLOGAccess();
+}
+
+/*
* Subroutine to try to fetch and validate a prior checkpoint record.
*
* whichChkpt identifies the checkpoint (merely for reporting purposes).
@@ -6126,7 +6222,7 @@ ShutdownXLOG(int code, Datum arg)
static void
LogCheckpointStart(int flags, bool restartpoint)
{
- char *msg;
+ const char *msg;
/*
* XXX: This is hopelessly untranslatable. We could call gettext_noop for
@@ -6205,7 +6301,7 @@ LogCheckpointEnd(bool restartpoint)
* CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
* ignoring checkpoint_completion_target parameter.
* CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occured
- * since the last one (implied by CHECKPOINT_IS_SHUTDOWN and
+ * since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
* CHECKPOINT_END_OF_RECOVERY).
*
* Note: flags contains other bits, of interest here only for logging purposes.
@@ -6225,44 +6321,19 @@ CreateCheckPoint(int flags)
uint32 _logSeg;
TransactionId *inCommitXids;
int nInCommit;
- bool OldInRecovery = InRecovery;
/*
* An end-of-recovery checkpoint is really a shutdown checkpoint, just
* issued at a different time.
*/
- if (flags & ((CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY) != 0))
+ if (flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY))
shutdown = true;
else
shutdown = false;
- /*
- * A startup checkpoint is created before anyone else is allowed to
- * write WAL. To allow us to write the checkpoint record, set
- * LocalRecoveryInProgress to false. This lets us write WAL, but others
- * are still not allowed to do so.
- */
- if (flags & CHECKPOINT_END_OF_RECOVERY)
- {
- Assert(RecoveryInProgress());
- LocalRecoveryInProgress = false;
- InitXLOGAccess();
-
- /*
- * Before 8.4, end-of-recovery checkpoints were always performed by
- * the startup process, and InRecovery was set true. InRecovery is not
- * normally set in bgwriter, but we set it here temporarily to avoid
- * confusing old code in the end-of-recovery checkpoint code path that
- * rely on it.
- */
- InRecovery = true;
- }
- else
- {
- /* shouldn't happen */
- if (RecoveryInProgress())
- elog(ERROR, "can't create a checkpoint during recovery");
- }
+ /* sanity check */
+ if (RecoveryInProgress() && (flags & CHECKPOINT_END_OF_RECOVERY) == 0)
+ elog(ERROR, "can't create a checkpoint during recovery");
/*
* Acquire CheckpointLock to ensure only one checkpoint happens at a time.
@@ -6305,7 +6376,6 @@ CreateCheckPoint(int flags)
/* Begin filling in the checkpoint WAL record */
MemSet(&checkPoint, 0, sizeof(checkPoint));
- checkPoint.ThisTimeLineID = ThisTimeLineID;
checkPoint.time = (pg_time_t) time(NULL);
/*
@@ -6473,6 +6543,20 @@ CreateCheckPoint(int flags)
START_CRIT_SECTION();
/*
+ * An end-of-recovery checkpoint is created before anyone is allowed to
+ * write WAL. To allow us to write the checkpoint record, temporarily
+ * enable XLogInsertAllowed.
+ */
+ if (flags & CHECKPOINT_END_OF_RECOVERY)
+ LocalSetXLogInsertAllowed();
+
+ /*
+ * This needs to be done after LocalSetXLogInsertAllowed(), else
+ * ThisTimeLineID might still be uninitialized.
+ */
+ checkPoint.ThisTimeLineID = ThisTimeLineID;
+
+ /*
* Now insert the checkpoint record into XLOG.
*/
rdata.data = (char *) (&checkPoint);
@@ -6488,6 +6572,21 @@ CreateCheckPoint(int flags)
XLogFlush(recptr);
/*
+ * We mustn't write any new WAL after a shutdown checkpoint, or it will
+ * be overwritten at next startup. No one should even try; this just
+ * allows sanity-checking. In the case of an end-of-recovery checkpoint,
+ * we want to just temporarily disable writing until the system has exited
+ * recovery.
+ */
+ if (shutdown)
+ {
+ if (flags & CHECKPOINT_END_OF_RECOVERY)
+ LocalXLogInsertAllowed = -1; /* return to "check" state */
+ else
+ LocalXLogInsertAllowed = 0; /* never again write WAL */
+ }
+
+ /*
* We now have ProcLastRecPtr = start of actual checkpoint record, recptr
* = end of actual checkpoint record.
*/
@@ -6560,7 +6659,7 @@ CreateCheckPoint(int flags)
* in subtrans.c). During recovery, though, we mustn't do this because
* StartupSUBTRANS hasn't been called yet.
*/
- if (!InRecovery)
+ if (!RecoveryInProgress())
TruncateSUBTRANS(GetOldestXmin(true, false));
/* All real work is done, but log before releasing lock. */
@@ -6574,9 +6673,6 @@ CreateCheckPoint(int flags)
CheckpointStats.ckpt_segs_recycled);
LWLockRelease(CheckpointLock);
-
- /* Restore old value */
- InRecovery = OldInRecovery;
}
/*
@@ -6597,10 +6693,14 @@ CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
}
/*
- * This is used during WAL recovery to establish a point from which recovery
- * can roll forward without replaying the entire recovery log. This function
- * is called each time a checkpoint record is read from XLOG. It is stored
- * in shared memory, so that it can be used as a restartpoint later on.
+ * Save a checkpoint for recovery restart if appropriate
+ *
+ * This function is called each time a checkpoint record is read from XLOG.
+ * It must determine whether the checkpoint represents a safe restartpoint or
+ * not. If so, the checkpoint record is stashed in shared memory so that
+ * CreateRestartPoint can consult it. (Note that the latter function is
+ * executed by the bgwriter, while this one will be executed by the startup
+ * process.)
*/
static void
RecoveryRestartPoint(const CheckPoint *checkPoint)
@@ -6640,12 +6740,14 @@ RecoveryRestartPoint(const CheckPoint *checkPoint)
}
/*
+ * Establish a restartpoint if possible.
+ *
* This is similar to CreateCheckPoint, but is used during WAL recovery
* to establish a point from which recovery can roll forward without
* replaying the entire recovery log.
*
* Returns true if a new restartpoint was established. We can only establish
- * a restartpoint if we have replayed a checkpoint record since last
+ * a restartpoint if we have replayed a safe checkpoint record since last
* restartpoint.
*/
bool
@@ -6663,7 +6765,7 @@ CreateRestartPoint(int flags)
*/
LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
- /* Get the a local copy of the last checkpoint record. */
+ /* Get a local copy of the last safe checkpoint record. */
SpinLockAcquire(&xlogctl->info_lck);
lastCheckPointRecPtr = xlogctl->lastCheckPointRecPtr;
memcpy(&lastCheckPoint, &XLogCtl->lastCheckPoint, sizeof(CheckPoint));
@@ -6723,14 +6825,21 @@ CreateRestartPoint(int flags)
CheckPointGuts(lastCheckPoint.redo, flags);
/*
- * Update pg_control, using current time
+ * Update pg_control, using current time. Check that it still shows
+ * IN_ARCHIVE_RECOVERY state and an older checkpoint, else do nothing;
+ * this is a quick hack to make sure nothing really bad happens if
+ * somehow we get here after the end-of-recovery checkpoint.
*/
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
- ControlFile->prevCheckPoint = ControlFile->checkPoint;
- ControlFile->checkPoint = lastCheckPointRecPtr;
- ControlFile->checkPointCopy = lastCheckPoint;
- ControlFile->time = (pg_time_t) time(NULL);
- UpdateControlFile();
+ if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY &&
+ XLByteLT(ControlFile->checkPointCopy.redo, lastCheckPoint.redo))
+ {
+ ControlFile->prevCheckPoint = ControlFile->checkPoint;
+ ControlFile->checkPoint = lastCheckPointRecPtr;
+ ControlFile->checkPointCopy = lastCheckPoint;
+ ControlFile->time = (pg_time_t) time(NULL);
+ UpdateControlFile();
+ }
LWLockRelease(ControlFileLock);
/*
@@ -6747,6 +6856,7 @@ CreateRestartPoint(int flags)
(errmsg("recovery restart point at %X/%X",
lastCheckPoint.redo.xlogid, lastCheckPoint.redo.xrecoff)));
+ /* XXX this is currently BROKEN: we run in the bgwriter, the wrong process to see recoveryLastXTime */
if (recoveryLastXTime)
ereport((log_checkpoints ? LOG : DEBUG2),
(errmsg("last completed transaction was at log time %s",
@@ -6821,7 +6931,7 @@ RequestXLogSwitch(void)
* XLOG resource manager's routines
*
* Definitions of info values are in include/catalog/pg_control.h, though
- * not all records types are related to control file processing.
+ * not all record types are related to control file updates.
*/
void
xlog_redo(XLogRecPtr lsn, XLogRecord *record)
diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c
index b5fd31532e6..831ea9478a7 100644
--- a/src/backend/postmaster/bgwriter.c
+++ b/src/backend/postmaster/bgwriter.c
@@ -19,7 +19,8 @@
* condition.)
*
* The bgwriter is started by the postmaster as soon as the startup subprocess
- * finishes. It remains alive until the postmaster commands it to terminate.
+ * finishes, or as soon as recovery begins if we are doing archive recovery.
+ * It remains alive until the postmaster commands it to terminate.
* Normal termination is by SIGUSR2, which instructs the bgwriter to execute
* a shutdown checkpoint and then exit(0). (All backends must be stopped
* before SIGUSR2 is issued!) Emergency termination is by SIGQUIT; like any
@@ -37,7 +38,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.61 2009/06/25 21:36:00 heikki Exp $
+ * $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.62 2009/06/26 20:29:04 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -902,11 +903,11 @@ BgWriterShmemInit(void)
*
* flags is a bitwise OR of the following:
* CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
- * CHECKPOINT_END_OF_RECOVERY: checkpoint is to finish WAL recovery.
+ * CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
* CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
* ignoring checkpoint_completion_target parameter.
* CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occured
- * since the last one (implied by CHECKPOINT_IS_SHUTDOWN and
+ * since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
* CHECKPOINT_END_OF_RECOVERY).
* CHECKPOINT_WAIT: wait for completion before returning (otherwise,
* just signal bgwriter to do it, and return).
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index c9b0e0ab2ec..3dbf36a6cf1 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -37,7 +37,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/postmaster/postmaster.c,v 1.582 2009/06/11 14:49:01 momjian Exp $
+ * $PostgreSQL: pgsql/src/backend/postmaster/postmaster.c,v 1.583 2009/06/26 20:29:04 tgl Exp $
*
* NOTES
*
@@ -227,21 +227,22 @@ static bool RecoveryError = false; /* T if WAL recovery failed */
*
* After doing all the postmaster initialization work, we enter PM_STARTUP
* state and the startup process is launched. The startup process begins by
- * reading the control file and other preliminary initialization steps. When
- * it's ready to start WAL redo, it signals postmaster, and we switch to
- * PM_RECOVERY phase. The background writer is launched, while the startup
- * process continues applying WAL.
+ * reading the control file and other preliminary initialization steps.
+ * In a normal startup, or after crash recovery, the startup process exits
+ * with exit code 0 and we switch to PM_RUN state. However, archive recovery
+ * is handled specially since it takes much longer and we would like to support
+ * hot standby during archive recovery.
*
+ * When the startup process is ready to start archive recovery, it signals the
+ * postmaster, and we switch to PM_RECOVERY state. The background writer is
+ * launched, while the startup process continues applying WAL.
* After reaching a consistent point in WAL redo, startup process signals
- * us again, and we switch to PM_RECOVERY_CONSISTENT phase. There's currently
+ * us again, and we switch to PM_RECOVERY_CONSISTENT state. There's currently
* no difference between PM_RECOVERY and PM_RECOVERY_CONSISTENT, but we
* could start accepting connections to perform read-only queries at this
* point, if we had the infrastructure to do that.
- *
- * When WAL redo is finished, the startup process exits with exit code 0
- * and we switch to PM_RUN state. Startup process can also skip the
- * recovery and consistent recovery phases altogether, as it will during
- * normal startup when there's no recovery to be done, for example.
+ * When archive recovery is finished, the startup process exits with exit
+ * code 0 and we switch to PM_RUN state.
*
* Normal child backends can only be launched when we are in PM_RUN state.
* (We also allow it in PM_WAIT_BACKUP state, but only for superusers.)
@@ -269,7 +270,7 @@ typedef enum
{
PM_INIT, /* postmaster starting */
PM_STARTUP, /* waiting for startup subprocess */
- PM_RECOVERY, /* in recovery mode */
+ PM_RECOVERY, /* in archive recovery mode */
PM_RECOVERY_CONSISTENT, /* consistent recovery mode */
PM_RUN, /* normal "database is alive" state */
PM_WAIT_BACKUP, /* waiting for online backup mode to end */
@@ -2195,8 +2196,8 @@ reaper(SIGNAL_ARGS)
/*
* Unexpected exit of startup process (including FATAL exit)
- * during PM_STARTUP is treated as catastrophic. There is no other
- * processes running yet, so we can just exit.
+ * during PM_STARTUP is treated as catastrophic. There are no
+ * other processes running yet, so we can just exit.
*/
if (pmState == PM_STARTUP && !EXIT_STATUS_0(exitstatus))
{
@@ -2247,7 +2248,7 @@ reaper(SIGNAL_ARGS)
/*
* Crank up the background writer, if we didn't do that already
- * when we entered consistent recovery phase. It doesn't matter
+ * when we entered consistent recovery state. It doesn't matter
* if this fails, we'll just try again later.
*/
if (BgWriterPID == 0)
@@ -4008,7 +4009,7 @@ sigusr1_handler(SIGNAL_ARGS)
/*
* Load the flat authorization file into postmaster's cache. The
* startup process won't have recomputed this from the database yet,
- * so we it may change following recovery.
+ * so it may change following recovery.
*/
load_role();
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 18402a6ad61..0c4861d6dbb 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.147 2009/06/25 21:36:00 heikki Exp $
+ * $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.148 2009/06/26 20:29:04 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -204,10 +204,10 @@ mdinit(void)
}
/*
- * In archive recovery, we rely on bgwriter to do fsyncs(), but we don't
- * know that we do archive recovery at process startup when pendingOpsTable
- * has already been created. Calling this function drops pendingOpsTable
- * and causes any subsequent requests to be forwarded to bgwriter.
+ * In archive recovery, we rely on bgwriter to do fsyncs, but we will have
+ * already created the pendingOpsTable during initialization of the startup
+ * process. Calling this function drops the local pendingOpsTable so that
+ * subsequent requests will be forwarded to bgwriter.
*/
void
SetForwardFsyncRequests(void)
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index ea9e232a08c..052a314d74e 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -6,7 +6,7 @@
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/include/access/xlog.h,v 1.92 2009/06/25 21:36:00 heikki Exp $
+ * $PostgreSQL: pgsql/src/include/access/xlog.h,v 1.93 2009/06/26 20:29:04 tgl Exp $
*/
#ifndef XLOG_H
#define XLOG_H
@@ -159,15 +159,15 @@ extern bool XLOG_DEBUG;
/* These directly affect the behavior of CreateCheckPoint and subsidiaries */
#define CHECKPOINT_IS_SHUTDOWN 0x0001 /* Checkpoint is for shutdown */
-#define CHECKPOINT_IMMEDIATE 0x0002 /* Do it without delays */
-#define CHECKPOINT_FORCE 0x0004 /* Force even if no activity */
+#define CHECKPOINT_END_OF_RECOVERY 0x0002 /* Like shutdown checkpoint, but
+ * issued at end of WAL recovery */
+#define CHECKPOINT_IMMEDIATE 0x0004 /* Do it without delays */
+#define CHECKPOINT_FORCE 0x0008 /* Force even if no activity */
/* These are important to RequestCheckpoint */
-#define CHECKPOINT_WAIT 0x0008 /* Wait for completion */
+#define CHECKPOINT_WAIT 0x0010 /* Wait for completion */
/* These indicate the cause of a checkpoint request */
-#define CHECKPOINT_CAUSE_XLOG 0x0010 /* XLOG consumption */
-#define CHECKPOINT_CAUSE_TIME 0x0020 /* Elapsed time */
-#define CHECKPOINT_END_OF_RECOVERY 0x0040 /* Like shutdown checkpoint, but
- * issued at end of WAL recovery */
+#define CHECKPOINT_CAUSE_XLOG 0x0020 /* XLOG consumption */
+#define CHECKPOINT_CAUSE_TIME 0x0040 /* Elapsed time */
/* Checkpoint statistics */
typedef struct CheckpointStatsData
@@ -202,6 +202,7 @@ extern void xlog_redo(XLogRecPtr lsn, XLogRecord *record);
extern void xlog_desc(StringInfo buf, uint8 xl_info, char *rec);
extern bool RecoveryInProgress(void);
+extern bool XLogInsertAllowed(void);
extern void UpdateControlFile(void);
extern Size XLOGShmemSize(void);