diff options
Diffstat (limited to 'src/backend')
-rw-r--r-- | src/backend/catalog/storage.c | 44 | ||||
-rw-r--r-- | src/backend/storage/smgr/md.c | 28 | ||||
-rw-r--r-- | src/backend/storage/smgr/smgr.c | 14 |
3 files changed, 62 insertions, 24 deletions
diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c index b8baa3b822d..0ac91be63dc 100644 --- a/src/backend/catalog/storage.c +++ b/src/backend/catalog/storage.c @@ -280,6 +280,7 @@ RelationTruncate(Relation rel, BlockNumber nblocks) bool vm; bool need_fsm_vacuum = false; ForkNumber forks[MAX_FORKNUM]; + BlockNumber old_blocks[MAX_FORKNUM]; BlockNumber blocks[MAX_FORKNUM]; int nforks = 0; SMgrRelation reln; @@ -295,6 +296,7 @@ RelationTruncate(Relation rel, BlockNumber nblocks) /* Prepare for truncation of MAIN fork of the relation */ forks[nforks] = MAIN_FORKNUM; + old_blocks[nforks] = smgrnblocks(reln, MAIN_FORKNUM); blocks[nforks] = nblocks; nforks++; @@ -306,6 +308,7 @@ RelationTruncate(Relation rel, BlockNumber nblocks) if (BlockNumberIsValid(blocks[nforks])) { forks[nforks] = FSM_FORKNUM; + old_blocks[nforks] = smgrnblocks(reln, FSM_FORKNUM); nforks++; need_fsm_vacuum = true; } @@ -319,6 +322,7 @@ RelationTruncate(Relation rel, BlockNumber nblocks) if (BlockNumberIsValid(blocks[nforks])) { forks[nforks] = VISIBILITYMAP_FORKNUM; + old_blocks[nforks] = smgrnblocks(reln, VISIBILITYMAP_FORKNUM); nforks++; } } @@ -357,14 +361,20 @@ RelationTruncate(Relation rel, BlockNumber nblocks) MyProc->delayChkptEnd = true; /* DELAY_CHKPT_COMPLETE */ /* - * We WAL-log the truncation before actually truncating, which means - * trouble if the truncation fails. If we then crash, the WAL replay - * likely isn't going to succeed in the truncation either, and cause a - * PANIC. It's tempting to put a critical section here, but that cure - * would be worse than the disease. It would turn a usually harmless - * failure to truncate, that might spell trouble at WAL replay, into a - * certain PANIC. + * We WAL-log the truncation first and then truncate in a critical + * section. Truncation drops buffers, even if dirty, and then truncates + * disk files. All of that work needs to complete before the lock is + * released, or else old versions of pages on disk that are missing recent + * changes would become accessible again. We'll try the whole operation + * again in crash recovery if we panic, but even then we can't give up + * because we don't want standbys' relation sizes to diverge and break + * replay or visibility invariants downstream. The critical section also + * suppresses interrupts. + * + * (See also pg_visibilitymap.c if changing this code.) */ + START_CRIT_SECTION(); + if (RelationNeedsWAL(rel)) { /* @@ -388,10 +398,10 @@ RelationTruncate(Relation rel, BlockNumber nblocks) * hit the disk before the WAL record, and the truncation of the FSM * or visibility map. If we crashed during that window, we'd be left * with a truncated heap, but the FSM or visibility map would still - * contain entries for the non-existent heap pages. + * contain entries for the non-existent heap pages, and standbys would + * also never replay the truncation. */ - if (fsm || vm) - XLogFlush(lsn); + XLogFlush(lsn); } /* @@ -399,7 +409,9 @@ RelationTruncate(Relation rel, BlockNumber nblocks) * longer exist after truncation is complete, and then truncate the * corresponding files on disk. */ - smgrtruncate(RelationGetSmgr(rel), forks, nforks, blocks); + smgrtruncate(RelationGetSmgr(rel), forks, nforks, old_blocks, blocks); + + END_CRIT_SECTION(); /* We've done all the critical work, so checkpoints are OK now. */ MyProc->delayChkpt = false; @@ -983,6 +995,7 @@ smgr_redo(XLogReaderState *record) Relation rel; ForkNumber forks[MAX_FORKNUM]; BlockNumber blocks[MAX_FORKNUM]; + BlockNumber old_blocks[MAX_FORKNUM]; int nforks = 0; bool need_fsm_vacuum = false; @@ -1017,6 +1030,7 @@ smgr_redo(XLogReaderState *record) if ((xlrec->flags & SMGR_TRUNCATE_HEAP) != 0) { forks[nforks] = MAIN_FORKNUM; + old_blocks[nforks] = smgrnblocks(reln, MAIN_FORKNUM); blocks[nforks] = xlrec->blkno; nforks++; @@ -1034,6 +1048,7 @@ smgr_redo(XLogReaderState *record) if (BlockNumberIsValid(blocks[nforks])) { forks[nforks] = FSM_FORKNUM; + old_blocks[nforks] = smgrnblocks(reln, FSM_FORKNUM); nforks++; need_fsm_vacuum = true; } @@ -1045,13 +1060,18 @@ smgr_redo(XLogReaderState *record) if (BlockNumberIsValid(blocks[nforks])) { forks[nforks] = VISIBILITYMAP_FORKNUM; + old_blocks[nforks] = smgrnblocks(reln, VISIBILITYMAP_FORKNUM); nforks++; } } /* Do the real work to truncate relation forks */ if (nforks > 0) - smgrtruncate(reln, forks, nforks, blocks); + { + START_CRIT_SECTION(); + smgrtruncate(reln, forks, nforks, old_blocks, blocks); + END_CRIT_SECTION(); + } /* * Update upper-level FSM pages to account for the truncation. This is diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index 72f1494c7ab..b48a6d49681 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -818,19 +818,21 @@ mdnblocks(SMgrRelation reln, ForkNumber forknum) /* * mdtruncate() -- Truncate relation to specified number of blocks. + * + * Guaranteed not to allocate memory, so it can be used in a critical section. + * Caller must have called smgrnblocks() to obtain curnblk while holding a + * sufficient lock to prevent a change in relation size, and not used any smgr + * functions for this relation or handled interrupts in between. This makes + * sure we have opened all active segments, so that truncate loop will get + * them all! */ void -mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) +mdtruncate(SMgrRelation reln, ForkNumber forknum, + BlockNumber curnblk, BlockNumber nblocks) { - BlockNumber curnblk; BlockNumber priorblocks; int curopensegs; - /* - * NOTE: mdnblocks makes sure we have opened all active segments, so that - * truncation loop will get them all! - */ - curnblk = mdnblocks(reln, forknum); if (nblocks > curnblk) { /* Bogus request ... but no complaint if InRecovery */ @@ -1099,7 +1101,7 @@ _fdvec_resize(SMgrRelation reln, reln->md_seg_fds[forknum] = MemoryContextAlloc(MdCxt, sizeof(MdfdVec) * nseg); } - else + else if (nseg > reln->md_num_open_segs[forknum]) { /* * It doesn't seem worthwhile complicating the code to amortize @@ -1111,6 +1113,16 @@ _fdvec_resize(SMgrRelation reln, repalloc(reln->md_seg_fds[forknum], sizeof(MdfdVec) * nseg); } + else + { + /* + * We don't reallocate a smaller array, because we want mdtruncate() + * to be able to promise that it won't allocate memory, so that it is + * allowed in a critical section. This means that a bit of space in + * the array is now wasted, until the next time we add a segment and + * reallocate. + */ + } reln->md_num_open_segs[forknum] = nseg; } diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index 1d3bb92dfe1..9e6500ff981 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -60,7 +60,7 @@ typedef struct f_smgr BlockNumber blocknum, BlockNumber nblocks); BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum); void (*smgr_truncate) (SMgrRelation reln, ForkNumber forknum, - BlockNumber nblocks); + BlockNumber old_blocks, BlockNumber nblocks); void (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum); } f_smgr; @@ -590,10 +590,15 @@ smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum) * * The caller must hold AccessExclusiveLock on the relation, to ensure that * other backends receive the smgr invalidation event that this function sends - * before they access any forks of the relation again. + * before they access any forks of the relation again. The current size of + * the forks should be provided in old_nblocks. This function should normally + * be called in a critical section, but the current size must be checked + * outside the critical section, and no interrupts or smgr functions relating + * to this relation should be called in between. */ void -smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *nblocks) +smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, + BlockNumber *old_nblocks, BlockNumber *nblocks) { int i; @@ -621,7 +626,8 @@ smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *nb /* Make the cached size is invalid if we encounter an error. */ reln->smgr_cached_nblocks[forknum[i]] = InvalidBlockNumber; - smgrsw[reln->smgr_which].smgr_truncate(reln, forknum[i], nblocks[i]); + smgrsw[reln->smgr_which].smgr_truncate(reln, forknum[i], + old_nblocks[i], nblocks[i]); /* * We might as well update the local smgr_cached_nblocks values. The |