diff options
Diffstat (limited to 'src/backend')
-rw-r--r-- | src/backend/access/gin/ginxlog.c | 12 | ||||
-rw-r--r-- | src/backend/access/gist/gistxlog.c | 10 | ||||
-rw-r--r-- | src/backend/access/nbtree/nbtxlog.c | 10 | ||||
-rw-r--r-- | src/backend/access/transam/rmgr.c | 34 | ||||
-rw-r--r-- | src/backend/access/transam/xlog.c | 205 |
5 files changed, 194 insertions, 77 deletions
diff --git a/src/backend/access/gin/ginxlog.c b/src/backend/access/gin/ginxlog.c index a3387422418..265e7de70c4 100644 --- a/src/backend/access/gin/ginxlog.c +++ b/src/backend/access/gin/ginxlog.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gin/ginxlog.c,v 1.3 2006/07/14 14:52:16 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/access/gin/ginxlog.c,v 1.4 2006/08/07 16:57:56 tgl Exp $ *------------------------------------------------------------------------- */ #include "postgres.h" @@ -528,7 +528,8 @@ gin_xlog_cleanup(void) { topCtx = MemoryContextSwitchTo(opCtx); - foreach(l, incomplete_splits) { + foreach(l, incomplete_splits) + { ginIncompleteSplit *split = (ginIncompleteSplit *) lfirst(l); ginContinueSplit( split ); MemoryContextReset( opCtx ); @@ -538,3 +539,10 @@ gin_xlog_cleanup(void) { MemoryContextDelete(opCtx); } +bool +gin_safe_restartpoint(void) +{ + if (incomplete_splits) + return false; + return true; +} diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c index 9694e32384e..9680ed96192 100644 --- a/src/backend/access/gist/gistxlog.c +++ b/src/backend/access/gist/gistxlog.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gist/gistxlog.c,v 1.22 2006/07/14 14:52:16 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/access/gist/gistxlog.c,v 1.23 2006/08/07 16:57:56 tgl Exp $ *------------------------------------------------------------------------- */ #include "postgres.h" @@ -818,6 +818,14 @@ gist_xlog_cleanup(void) MemoryContextDelete(insertCtx); } +bool +gist_safe_restartpoint(void) +{ + if (incomplete_inserts) + return false; + return true; +} + XLogRecData * formSplitRdata(RelFileNode node, BlockNumber blkno, bool page_is_leaf, diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index 126f593cd96..88962ccfcec 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.36 2006/07/25 19:13:00 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.37 2006/08/07 16:57:56 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -794,3 +794,11 @@ btree_xlog_cleanup(void) } incomplete_splits = NIL; } + +bool +btree_safe_restartpoint(void) +{ + if (incomplete_splits) + return false; + return true; +} diff --git a/src/backend/access/transam/rmgr.c b/src/backend/access/transam/rmgr.c index 331816f5e6d..7db05c9c482 100644 --- a/src/backend/access/transam/rmgr.c +++ b/src/backend/access/transam/rmgr.c @@ -3,7 +3,7 @@ * * Resource managers definition * - * $PostgreSQL: pgsql/src/backend/access/transam/rmgr.c,v 1.23 2006/07/11 17:26:58 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/access/transam/rmgr.c,v 1.24 2006/08/07 16:57:56 tgl Exp $ */ #include "postgres.h" @@ -23,20 +23,20 @@ const RmgrData RmgrTable[RM_MAX_ID + 1] = { - {"XLOG", xlog_redo, xlog_desc, NULL, NULL}, - {"Transaction", xact_redo, xact_desc, NULL, NULL}, - {"Storage", smgr_redo, smgr_desc, NULL, NULL}, - {"CLOG", clog_redo, clog_desc, NULL, NULL}, - {"Database", dbase_redo, dbase_desc, NULL, NULL}, - {"Tablespace", tblspc_redo, tblspc_desc, NULL, NULL}, - {"MultiXact", multixact_redo, multixact_desc, NULL, NULL}, - {"Reserved 7", NULL, NULL, NULL, NULL}, - {"Reserved 8", NULL, NULL, NULL, NULL}, - {"Reserved 9", NULL, NULL, NULL, NULL}, - {"Heap", heap_redo, heap_desc, NULL, NULL}, - {"Btree", btree_redo, btree_desc, btree_xlog_startup, btree_xlog_cleanup}, - {"Hash", hash_redo, hash_desc, NULL, NULL}, - {"Gin", gin_redo, gin_desc, gin_xlog_startup, gin_xlog_cleanup}, - {"Gist", gist_redo, gist_desc, gist_xlog_startup, gist_xlog_cleanup}, - {"Sequence", seq_redo, seq_desc, NULL, NULL} + {"XLOG", xlog_redo, xlog_desc, NULL, NULL, NULL}, + {"Transaction", xact_redo, xact_desc, NULL, NULL, NULL}, + {"Storage", smgr_redo, smgr_desc, NULL, NULL, NULL}, + {"CLOG", clog_redo, clog_desc, NULL, NULL, NULL}, + {"Database", dbase_redo, dbase_desc, NULL, NULL, NULL}, + {"Tablespace", tblspc_redo, tblspc_desc, NULL, NULL, NULL}, + {"MultiXact", multixact_redo, multixact_desc, NULL, NULL, NULL}, + {"Reserved 7", NULL, NULL, NULL, NULL, NULL}, + {"Reserved 8", NULL, NULL, NULL, NULL, NULL}, + {"Reserved 9", NULL, NULL, NULL, NULL, NULL}, + {"Heap", heap_redo, heap_desc, NULL, NULL, NULL}, + {"Btree", btree_redo, btree_desc, btree_xlog_startup, btree_xlog_cleanup, btree_safe_restartpoint}, + {"Hash", hash_redo, hash_desc, NULL, NULL, NULL}, + {"Gin", gin_redo, gin_desc, gin_xlog_startup, gin_xlog_cleanup, gin_safe_restartpoint}, + {"Gist", gist_redo, gist_desc, gist_xlog_startup, gist_xlog_cleanup, gist_safe_restartpoint}, + {"Sequence", seq_redo, seq_desc, NULL, NULL, NULL} }; diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index de8e682d849..21e7b47206d 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.246 2006/08/06 03:53:44 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.247 2006/08/07 16:57:56 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -120,6 +120,7 @@ /* File path names (all relative to $PGDATA) */ #define BACKUP_LABEL_FILE "backup_label" +#define BACKUP_LABEL_OLD "backup_label.old" #define RECOVERY_COMMAND_FILE "recovery.conf" #define RECOVERY_COMMAND_DONE "recovery.done" @@ -185,9 +186,6 @@ static TransactionId recoveryStopXid; static time_t recoveryStopTime; static bool recoveryStopAfter; -/* constraint set by read_backup_label */ -static XLogRecPtr recoveryMinXlogOffset = {0, 0}; - /* * During normal operation, the only timeline we care about is ThisTimeLineID. * During recovery, however, things are more complicated. To simplify life @@ -463,6 +461,7 @@ static void readRecoveryCommandFile(void); static void exitArchiveRecovery(TimeLineID endTLI, uint32 endLogId, uint32 endLogSeg); static bool recoveryStopsHere(XLogRecord *record, bool *includeThis); +static void CheckPointGuts(XLogRecPtr checkPointRedo); static bool XLogCheckBuffer(XLogRecData *rdata, bool doPageWrites, XLogRecPtr *lsn, BkpBlock *bkpb); @@ -499,8 +498,8 @@ static void issue_xlog_fsync(void); #ifdef WAL_DEBUG static void xlog_outrec(StringInfo buf, XLogRecord *record); #endif -static bool read_backup_label(XLogRecPtr *checkPointLoc); -static void remove_backup_label(void); +static bool read_backup_label(XLogRecPtr *checkPointLoc, + XLogRecPtr *minRecoveryLoc); static void rm_redo_error_callback(void *arg); @@ -4594,9 +4593,11 @@ StartupXLOG(void) CheckPoint checkPoint; bool wasShutdown; bool needNewTimeLine = false; + bool haveBackupLabel = false; XLogRecPtr RecPtr, LastRec, checkPointLoc, + minRecoveryLoc, EndOfLog; uint32 endLogId; uint32 endLogSeg; @@ -4629,12 +4630,18 @@ StartupXLOG(void) ereport(LOG, (errmsg("database system shutdown was interrupted at %s", str_time(ControlFile->time)))); - else if (ControlFile->state == DB_IN_RECOVERY) + else if (ControlFile->state == DB_IN_CRASH_RECOVERY) ereport(LOG, (errmsg("database system was interrupted while in recovery at %s", str_time(ControlFile->time)), errhint("This probably means that some data is corrupted and" " you will have to use the last backup for recovery."))); + else if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY) + ereport(LOG, + (errmsg("database system was interrupted while in recovery at log time %s", + str_time(ControlFile->checkPointCopy.time)), + errhint("If this has occurred more than once some data may be corrupted" + " and you may need to choose an earlier recovery target."))); else if (ControlFile->state == DB_IN_PRODUCTION) ereport(LOG, (errmsg("database system was interrupted at %s", @@ -4673,7 +4680,7 @@ StartupXLOG(void) recoveryTargetTLI, ControlFile->checkPointCopy.ThisTimeLineID))); - if (read_backup_label(&checkPointLoc)) + if (read_backup_label(&checkPointLoc, &minRecoveryLoc)) { /* * When a backup_label file is present, we want to roll forward from @@ -4693,6 +4700,8 @@ StartupXLOG(void) (errmsg("could not locate required checkpoint record"), errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir))); } + /* set flag to delete it later */ + haveBackupLabel = true; } else { @@ -4790,17 +4799,51 @@ StartupXLOG(void) { int rmid; + /* + * Update pg_control to show that we are recovering and to show + * the selected checkpoint as the place we are starting from. + * We also mark pg_control with any minimum recovery stop point + * obtained from a backup history file. + */ if (InArchiveRecovery) + { ereport(LOG, (errmsg("automatic recovery in progress"))); + ControlFile->state = DB_IN_ARCHIVE_RECOVERY; + } else + { ereport(LOG, (errmsg("database system was not properly shut down; " "automatic recovery in progress"))); - ControlFile->state = DB_IN_RECOVERY; + ControlFile->state = DB_IN_CRASH_RECOVERY; + } + ControlFile->prevCheckPoint = ControlFile->checkPoint; + ControlFile->checkPoint = checkPointLoc; + ControlFile->checkPointCopy = checkPoint; + if (minRecoveryLoc.xlogid != 0 || minRecoveryLoc.xrecoff != 0) + ControlFile->minRecoveryPoint = minRecoveryLoc; ControlFile->time = time(NULL); UpdateControlFile(); + /* + * If there was a backup label file, it's done its job and the + * info has now been propagated into pg_control. We must get rid of + * the label file so that if we crash during recovery, we'll pick up + * at the latest recovery restartpoint instead of going all the way + * back to the backup start point. It seems prudent though to just + * rename the file out of the way rather than delete it completely. + */ + if (haveBackupLabel) + { + unlink(BACKUP_LABEL_OLD); + if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0) + ereport(FATAL, + (errcode_for_file_access(), + errmsg("could not rename file \"%s\" to \"%s\": %m", + BACKUP_LABEL_FILE, BACKUP_LABEL_OLD))); + } + /* Start up the recovery environment */ XLogInitRelationCache(); @@ -4927,7 +4970,7 @@ StartupXLOG(void) * Complain if we did not roll forward far enough to render the backup * dump consistent. */ - if (XLByteLT(EndOfLog, recoveryMinXlogOffset)) + if (XLByteLT(EndOfLog, ControlFile->minRecoveryPoint)) { if (needNewTimeLine) /* stopped because of stop request */ ereport(FATAL, @@ -5051,34 +5094,20 @@ StartupXLOG(void) pgstat_reset_all(); /* - * Perform a new checkpoint to update our recovery activity to disk. + * Perform a checkpoint to update all our recovery activity to disk. * * Note that we write a shutdown checkpoint rather than an on-line * one. This is not particularly critical, but since we may be * assigning a new TLI, using a shutdown checkpoint allows us to have * the rule that TLI only changes in shutdown checkpoints, which * allows some extra error checking in xlog_redo. - * - * In case we had to use the secondary checkpoint, make sure that it - * will still be shown as the secondary checkpoint after this - * CreateCheckPoint operation; we don't want the broken primary - * checkpoint to become prevCheckPoint... */ - if (XLByteEQ(checkPointLoc, ControlFile->prevCheckPoint)) - ControlFile->checkPoint = checkPointLoc; - CreateCheckPoint(true, true); /* * Close down recovery environment */ XLogCloseRelationCache(); - - /* - * Now that we've checkpointed the recovery, it's safe to flush old - * backup_label, if present. - */ - remove_backup_label(); } /* @@ -5464,6 +5493,10 @@ CreateCheckPoint(bool shutdown, bool force) LWLockRelease(CheckpointStartLock); + if (!shutdown) + ereport(DEBUG2, + (errmsg("checkpoint starting"))); + /* * Get the other info we need for the checkpoint record. */ @@ -5494,16 +5527,7 @@ CreateCheckPoint(bool shutdown, bool force) */ END_CRIT_SECTION(); - if (!shutdown) - ereport(DEBUG2, - (errmsg("checkpoint starting"))); - - CheckPointCLOG(); - CheckPointSUBTRANS(); - CheckPointMultiXact(); - FlushBufferPool(); - /* We deliberately delay 2PC checkpointing as long as possible */ - CheckPointTwoPhase(checkPoint.redo); + CheckPointGuts(checkPoint.redo); START_CRIT_SECTION(); @@ -5592,6 +5616,85 @@ CreateCheckPoint(bool shutdown, bool force) } /* + * Flush all data in shared memory to disk, and fsync + * + * This is the common code shared between regular checkpoints and + * recovery restartpoints. + */ +static void +CheckPointGuts(XLogRecPtr checkPointRedo) +{ + CheckPointCLOG(); + CheckPointSUBTRANS(); + CheckPointMultiXact(); + FlushBufferPool(); /* performs all required fsyncs */ + /* We deliberately delay 2PC checkpointing as long as possible */ + CheckPointTwoPhase(checkPointRedo); +} + +/* + * Set a recovery restart point if appropriate + * + * This is similar to CreateCheckpoint, but is used during WAL recovery + * to establish a point from which recovery can roll forward without + * replaying the entire recovery log. This function is called each time + * a checkpoint record is read from XLOG; it must determine whether a + * restartpoint is needed or not. + */ +static void +RecoveryRestartPoint(const CheckPoint *checkPoint) +{ + int elapsed_secs; + int rmid; + + /* + * Do nothing if the elapsed time since the last restartpoint is less + * than half of checkpoint_timeout. (We use a value less than + * checkpoint_timeout so that variations in the timing of checkpoints on + * the master, or speed of transmission of WAL segments to a slave, won't + * make the slave skip a restartpoint once it's synced with the master.) + * Checking true elapsed time keeps us from doing restartpoints too often + * while rapidly scanning large amounts of WAL. + */ + elapsed_secs = time(NULL) - ControlFile->time; + if (elapsed_secs < CheckPointTimeout / 2) + return; + + /* + * Is it safe to checkpoint? We must ask each of the resource managers + * whether they have any partial state information that might prevent a + * correct restart from this point. If so, we skip this opportunity, but + * return at the next checkpoint record for another try. + */ + for (rmid = 0; rmid <= RM_MAX_ID; rmid++) + { + if (RmgrTable[rmid].rm_safe_restartpoint != NULL) + if (!(RmgrTable[rmid].rm_safe_restartpoint())) + return; + } + + /* + * OK, force data out to disk + */ + CheckPointGuts(checkPoint->redo); + + /* + * Update pg_control so that any subsequent crash will restart from + * this checkpoint. Note: ReadRecPtr gives the XLOG address of the + * checkpoint record itself. + */ + ControlFile->prevCheckPoint = ControlFile->checkPoint; + ControlFile->checkPoint = ReadRecPtr; + ControlFile->checkPointCopy = *checkPoint; + ControlFile->time = time(NULL); + UpdateControlFile(); + + ereport(DEBUG2, + (errmsg("recovery restart point at %X/%X", + checkPoint->redo.xlogid, checkPoint->redo.xrecoff))); +} + +/* * Write a NEXTOID log record */ void @@ -5687,6 +5790,8 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record) /* Following WAL records should be run with new TLI */ ThisTimeLineID = checkPoint.ThisTimeLineID; } + + RecoveryRestartPoint(&checkPoint); } else if (info == XLOG_CHECKPOINT_ONLINE) { @@ -5709,6 +5814,8 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record) ereport(PANIC, (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record", checkPoint.ThisTimeLineID, ThisTimeLineID))); + + RecoveryRestartPoint(&checkPoint); } else if (info == XLOG_SWITCH) { @@ -6349,14 +6456,14 @@ pg_xlogfile_name(PG_FUNCTION_ARGS) * point, we will fail to restore a consistent database state. * * We also attempt to retrieve the corresponding backup history file. - * If successful, set recoveryMinXlogOffset to constrain valid PITR stopping + * If successful, set *minRecoveryLoc to constrain valid PITR stopping * points. * * Returns TRUE if a backup_label was found (and fills the checkpoint * location into *checkPointLoc); returns FALSE if not. */ static bool -read_backup_label(XLogRecPtr *checkPointLoc) +read_backup_label(XLogRecPtr *checkPointLoc, XLogRecPtr *minRecoveryLoc) { XLogRecPtr startpoint; XLogRecPtr stoppoint; @@ -6371,6 +6478,10 @@ read_backup_label(XLogRecPtr *checkPointLoc) FILE *fp; char ch; + /* Default is to not constrain recovery stop point */ + minRecoveryLoc->xlogid = 0; + minRecoveryLoc->xrecoff = 0; + /* * See if label file is present */ @@ -6439,7 +6550,7 @@ read_backup_label(XLogRecPtr *checkPointLoc) ereport(FATAL, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("invalid data in file \"%s\"", histfilename))); - recoveryMinXlogOffset = stoppoint; + *minRecoveryLoc = stoppoint; if (ferror(fp) || FreeFile(fp)) ereport(FATAL, (errcode_for_file_access(), @@ -6451,24 +6562,6 @@ read_backup_label(XLogRecPtr *checkPointLoc) } /* - * remove_backup_label: remove any extant backup_label after successful - * recovery. Once we have completed the end-of-recovery checkpoint there - * is no reason to have to replay from the start point indicated by the - * label (and indeed we'll probably have removed/recycled the needed WAL - * segments), so remove the label to prevent trouble in later crash recoveries. - */ -static void -remove_backup_label(void) -{ - if (unlink(BACKUP_LABEL_FILE) != 0) - if (errno != ENOENT) - ereport(FATAL, - (errcode_for_file_access(), - errmsg("could not remove file \"%s\": %m", - BACKUP_LABEL_FILE))); -} - -/* * Error context callback for errors occurring during rm_redo(). */ static void |