diff options
author | Robert Haas <rhaas@postgresql.org> | 2022-03-24 14:32:24 -0400 |
---|---|---|
committer | Robert Haas <rhaas@postgresql.org> | 2022-03-24 14:32:48 -0400 |
commit | bbace5697df12398e87ffd9879171c39d27f5b33 (patch) | |
tree | 7d076a8cda05aa7c4c1c6eb2b505e2fa279f6679 /src/backend | |
parent | 81045e1e1c3370fb7e57c8841b0a7b6aab31831b (diff) | |
download | postgresql-bbace5697df12398e87ffd9879171c39d27f5b33.tar.gz postgresql-bbace5697df12398e87ffd9879171c39d27f5b33.zip |
Fix possible recovery trouble if TRUNCATE overlaps a checkpoint.
If TRUNCATE causes some buffers to be invalidated and thus the
checkpoint does not flush them, TRUNCATE must also ensure that the
corresponding files are truncated on disk. Otherwise, a replay
from the checkpoint might find that the buffers exist but have
the wrong contents, which may cause replay to fail.
Report by Teja Mupparti. Patch by Kyotaro Horiguchi, per a design
suggestion from Heikki Linnakangas, with some changes to the
comments by me. Review of this and a prior patch that approached
the issue differently by Heikki Linnakangas, Andres Freund, Álvaro
Herrera, Masahiko Sawada, and Tom Lane.
Discussion: http://postgr.es/m/BYAPR06MB6373BF50B469CA393C614257ABF00@BYAPR06MB6373.namprd06.prod.outlook.com
Diffstat (limited to 'src/backend')
-rw-r--r-- | src/backend/access/transam/multixact.c | 6 | ||||
-rw-r--r-- | src/backend/access/transam/twophase.c | 12 | ||||
-rw-r--r-- | src/backend/access/transam/xact.c | 5 | ||||
-rw-r--r-- | src/backend/access/transam/xlog.c | 16 | ||||
-rw-r--r-- | src/backend/access/transam/xloginsert.c | 2 | ||||
-rw-r--r-- | src/backend/catalog/storage.c | 29 | ||||
-rw-r--r-- | src/backend/storage/buffer/bufmgr.c | 6 | ||||
-rw-r--r-- | src/backend/storage/ipc/procarray.c | 26 | ||||
-rw-r--r-- | src/backend/storage/lmgr/proc.c | 4 |
9 files changed, 81 insertions, 25 deletions
diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c index b643564f16a..50d8bab9e21 100644 --- a/src/backend/access/transam/multixact.c +++ b/src/backend/access/transam/multixact.c @@ -3075,8 +3075,8 @@ TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB) * crash/basebackup, even though the state of the data directory would * require it. */ - Assert(!MyProc->delayChkpt); - MyProc->delayChkpt = true; + Assert((MyProc->delayChkpt & DELAY_CHKPT_START) == 0); + MyProc->delayChkpt |= DELAY_CHKPT_START; /* WAL log truncation */ WriteMTruncateXlogRec(newOldestMultiDB, @@ -3102,7 +3102,7 @@ TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB) /* Then offsets */ PerformOffsetsTruncation(oldestMulti, newOldestMulti); - MyProc->delayChkpt = false; + MyProc->delayChkpt &= ~DELAY_CHKPT_START; END_CRIT_SECTION(); LWLockRelease(MultiXactTruncationLock); diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index 7cc76c1db73..dea3f485f7a 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -474,7 +474,7 @@ MarkAsPreparingGuts(GlobalTransaction gxact, TransactionId xid, const char *gid, } proc->xid = xid; Assert(proc->xmin == InvalidTransactionId); - proc->delayChkpt = false; + proc->delayChkpt = 0; proc->statusFlags = 0; proc->pid = 0; proc->databaseId = databaseid; @@ -1165,7 +1165,8 @@ EndPrepare(GlobalTransaction gxact) START_CRIT_SECTION(); - MyProc->delayChkpt = true; + Assert((MyProc->delayChkpt & DELAY_CHKPT_START) == 0); + MyProc->delayChkpt |= DELAY_CHKPT_START; XLogBeginInsert(); for (record = records.head; record != NULL; record = record->next) @@ -1208,7 +1209,7 @@ EndPrepare(GlobalTransaction gxact) * checkpoint starting after this will certainly see the gxact as a * candidate for fsyncing. */ - MyProc->delayChkpt = false; + MyProc->delayChkpt &= ~DELAY_CHKPT_START; /* * Remember that we have this GlobalTransaction entry locked for us. If @@ -2275,7 +2276,8 @@ RecordTransactionCommitPrepared(TransactionId xid, START_CRIT_SECTION(); /* See notes in RecordTransactionCommit */ - MyProc->delayChkpt = true; + Assert((MyProc->delayChkpt & DELAY_CHKPT_START) == 0); + MyProc->delayChkpt |= DELAY_CHKPT_START; /* * Emit the XLOG commit record. Note that we mark 2PC commits as @@ -2323,7 +2325,7 @@ RecordTransactionCommitPrepared(TransactionId xid, TransactionIdCommitTree(xid, nchildren, children); /* Checkpoint can proceed now */ - MyProc->delayChkpt = false; + MyProc->delayChkpt &= ~DELAY_CHKPT_START; END_CRIT_SECTION(); diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 514044f3db7..c5e7261921d 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -1335,8 +1335,9 @@ RecordTransactionCommit(void) * This makes checkpoint's determination of which xacts are delayChkpt * a bit fuzzy, but it doesn't matter. */ + Assert((MyProc->delayChkpt & DELAY_CHKPT_START) == 0); START_CRIT_SECTION(); - MyProc->delayChkpt = true; + MyProc->delayChkpt |= DELAY_CHKPT_START; SetCurrentTransactionStopTimestamp(); @@ -1437,7 +1438,7 @@ RecordTransactionCommit(void) */ if (markXidCommitted) { - MyProc->delayChkpt = false; + MyProc->delayChkpt &= ~DELAY_CHKPT_START; END_CRIT_SECTION(); } diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index b89c58118e9..d889d69387e 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -9228,18 +9228,30 @@ CreateCheckPoint(int flags) * and we will correctly flush the update below. So we cannot miss any * xacts we need to wait for. */ - vxids = GetVirtualXIDsDelayingChkpt(&nvxids); + vxids = GetVirtualXIDsDelayingChkpt(&nvxids, DELAY_CHKPT_START); if (nvxids > 0) { do { pg_usleep(10000L); /* wait for 10 msec */ - } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids)); + } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids, + DELAY_CHKPT_START)); } pfree(vxids); CheckPointGuts(checkPoint.redo, flags); + vxids = GetVirtualXIDsDelayingChkpt(&nvxids, DELAY_CHKPT_COMPLETE); + if (nvxids > 0) + { + do + { + pg_usleep(10000L); /* wait for 10 msec */ + } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids, + DELAY_CHKPT_COMPLETE)); + } + pfree(vxids); + /* * Take a snapshot of running transactions and write this to WAL. This * allows us to reconstruct the state of running transactions during diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index b153fad594d..1af4a90c419 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -925,7 +925,7 @@ XLogSaveBufferForHint(Buffer buffer, bool buffer_std) /* * Ensure no checkpoint can change our view of RedoRecPtr. */ - Assert(MyProc->delayChkpt); + Assert((MyProc->delayChkpt & DELAY_CHKPT_START) != 0); /* * Update RedoRecPtr so that we can make the right decision diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c index cba7a9ada07..fa5682dce80 100644 --- a/src/backend/catalog/storage.c +++ b/src/backend/catalog/storage.c @@ -326,6 +326,22 @@ RelationTruncate(Relation rel, BlockNumber nblocks) RelationPreTruncate(rel); /* + * Make sure that a concurrent checkpoint can't complete while truncation + * is in progress. + * + * The truncation operation might drop buffers that the checkpoint + * otherwise would have flushed. If it does, then it's essential that + * the files actually get truncated on disk before the checkpoint record + * is written. Otherwise, if reply begins from that checkpoint, the + * to-be-truncated blocks might still exist on disk but have older + * contents than expected, which can cause replay to fail. It's OK for + * the blocks to not exist on disk at all, but not for them to have the + * wrong contents. + */ + Assert((MyProc->delayChkpt & DELAY_CHKPT_COMPLETE) == 0); + MyProc->delayChkpt |= DELAY_CHKPT_COMPLETE; + + /* * We WAL-log the truncation before actually truncating, which means * trouble if the truncation fails. If we then crash, the WAL replay * likely isn't going to succeed in the truncation either, and cause a @@ -363,13 +379,24 @@ RelationTruncate(Relation rel, BlockNumber nblocks) XLogFlush(lsn); } - /* Do the real work to truncate relation forks */ + /* + * This will first remove any buffers from the buffer pool that should no + * longer exist after truncation is complete, and then truncate the + * corresponding files on disk. + */ smgrtruncate(rel->rd_smgr, forks, nforks, blocks); + /* We've done all the critical work, so checkpoints are OK now. */ + MyProc->delayChkpt &= ~DELAY_CHKPT_COMPLETE; + /* * Update upper-level FSM pages to account for the truncation. This is * important because the just-truncated pages were likely marked as * all-free, and would be preferentially selected. + * + * NB: There's no point in delaying checkpoints until this is done. + * Because the FSM is not WAL-logged, we have to be prepared for the + * possibility of corruption after a crash anyway. */ if (need_fsm_vacuum) FreeSpaceMapVacuumRange(rel, nblocks, InvalidBlockNumber); diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index ffc6056c60c..a55545a1875 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -3946,7 +3946,9 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std) * essential that CreateCheckpoint waits for virtual transactions * rather than full transactionids. */ - MyProc->delayChkpt = delayChkpt = true; + Assert((MyProc->delayChkpt & DELAY_CHKPT_START) == 0); + MyProc->delayChkpt |= DELAY_CHKPT_START; + delayChkpt = true; lsn = XLogSaveBufferForHint(buffer, buffer_std); } @@ -3979,7 +3981,7 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std) UnlockBufHdr(bufHdr, buf_state); if (delayChkpt) - MyProc->delayChkpt = false; + MyProc->delayChkpt &= ~DELAY_CHKPT_START; if (dirtied) { diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index f047f9a242c..ae71d7538be 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -689,7 +689,10 @@ ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid) proc->lxid = InvalidLocalTransactionId; proc->xmin = InvalidTransactionId; - proc->delayChkpt = false; /* be sure this is cleared in abort */ + + /* be sure this is cleared in abort */ + proc->delayChkpt = 0; + proc->recoveryConflictPending = false; /* must be cleared with xid/xmin: */ @@ -728,7 +731,10 @@ ProcArrayEndTransactionInternal(PGPROC *proc, TransactionId latestXid) proc->xid = InvalidTransactionId; proc->lxid = InvalidLocalTransactionId; proc->xmin = InvalidTransactionId; - proc->delayChkpt = false; /* be sure this is cleared in abort */ + + /* be sure this is cleared in abort */ + proc->delayChkpt = 0; + proc->recoveryConflictPending = false; /* must be cleared with xid/xmin: */ @@ -3043,7 +3049,8 @@ GetOldestSafeDecodingTransactionId(bool catalogOnly) * delaying checkpoint because they have critical actions in progress. * * Constructs an array of VXIDs of transactions that are currently in commit - * critical sections, as shown by having delayChkpt set in their PGPROC. + * critical sections, as shown by having specified delayChkpt bits set in their + * PGPROC. * * Returns a palloc'd array that should be freed by the caller. * *nvxids is the number of valid entries. @@ -3057,13 +3064,15 @@ GetOldestSafeDecodingTransactionId(bool catalogOnly) * for clearing of delayChkpt to propagate is unimportant for correctness. */ VirtualTransactionId * -GetVirtualXIDsDelayingChkpt(int *nvxids) +GetVirtualXIDsDelayingChkpt(int *nvxids, int type) { VirtualTransactionId *vxids; ProcArrayStruct *arrayP = procArray; int count = 0; int index; + Assert(type != 0); + /* allocate what's certainly enough result space */ vxids = (VirtualTransactionId *) palloc(sizeof(VirtualTransactionId) * arrayP->maxProcs); @@ -3075,7 +3084,7 @@ GetVirtualXIDsDelayingChkpt(int *nvxids) int pgprocno = arrayP->pgprocnos[index]; PGPROC *proc = &allProcs[pgprocno]; - if (proc->delayChkpt) + if ((proc->delayChkpt & type) != 0) { VirtualTransactionId vxid; @@ -3101,12 +3110,14 @@ GetVirtualXIDsDelayingChkpt(int *nvxids) * those numbers should be small enough for it not to be a problem. */ bool -HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids) +HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids, int type) { bool result = false; ProcArrayStruct *arrayP = procArray; int index; + Assert(type != 0); + LWLockAcquire(ProcArrayLock, LW_SHARED); for (index = 0; index < arrayP->numProcs; index++) @@ -3117,7 +3128,8 @@ HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids) GET_VXID_FROM_PGPROC(vxid, *proc); - if (proc->delayChkpt && VirtualTransactionIdIsValid(vxid)) + if ((proc->delayChkpt & type) != 0 && + VirtualTransactionIdIsValid(vxid)) { int i; diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index 2575ea1ca0d..c50a419a546 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -394,7 +394,7 @@ InitProcess(void) MyProc->roleId = InvalidOid; MyProc->tempNamespaceId = InvalidOid; MyProc->isBackgroundWorker = IsBackgroundWorker; - MyProc->delayChkpt = false; + MyProc->delayChkpt = 0; MyProc->statusFlags = 0; /* NB -- autovac launcher intentionally does not set IS_AUTOVACUUM */ if (IsAutoVacuumWorkerProcess()) @@ -579,7 +579,7 @@ InitAuxiliaryProcess(void) MyProc->roleId = InvalidOid; MyProc->tempNamespaceId = InvalidOid; MyProc->isBackgroundWorker = IsBackgroundWorker; - MyProc->delayChkpt = false; + MyProc->delayChkpt = 0; MyProc->statusFlags = 0; MyProc->lwWaiting = false; MyProc->lwWaitMode = 0; |