From 8fc23a9ed0a040d039431ef79b1bf166395ed180 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 13 Nov 2014 19:47:44 +0200 Subject: Fix race condition between hot standby and restoring a full-page image. There was a window in RestoreBackupBlock where a page would be zeroed out, but not yet locked. If a backend pinned and locked the page in that window, it saw the zeroed page instead of the old page or new page contents, which could lead to missing rows in a result set, or errors. To fix, replace RBM_ZERO with RBM_ZERO_AND_LOCK, which atomically pins, zeroes, and locks the page, if it's not in the buffer cache already. In stable branches, the old RBM_ZERO constant is renamed to RBM_DO_NOT_USE, to avoid breaking any 3rd party extensions that might use RBM_ZERO. More importantly, this avoids renumbering the other enum values, which would cause even bigger confusion in extensions that use ReadBufferExtended, but haven't been recompiled. Backpatch to all supported versions; this has been racy since hot standby was introduced. --- src/backend/storage/buffer/bufmgr.c | 41 +++++++++++++++++++++++++++++++++---- 1 file changed, 37 insertions(+), 4 deletions(-) (limited to 'src/backend/storage/buffer/bufmgr.c') diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index f5da54472c1..74ef1cbc629 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -213,14 +213,19 @@ ReadBuffer(Relation reln, BlockNumber blockNum) * valid, the page is zeroed instead of throwing an error. This is intended * for non-critical data, where the caller is prepared to repair errors. * - * In RBM_ZERO mode, if the page isn't in buffer cache already, it's filled - * with zeros instead of reading it from disk. 
Useful when the caller is - * going to fill the page from scratch, since this saves I/O and avoids + * In RBM_ZERO_AND_LOCK mode, if the page isn't in buffer cache already, it's + * filled with zeros instead of reading it from disk. Useful when the caller + * is going to fill the page from scratch, since this saves I/O and avoids * unnecessary failure if the page-on-disk has corrupt page headers. + * The page is returned locked to ensure that the caller has a chance to + * initialize the page before it's made visible to others. * Caution: do not use this mode to read a page that is beyond the relation's * current physical EOF; that is likely to cause problems in md.c when * the page is modified and written out. P_NEW is OK, though. * + * RBM_ZERO_AND_CLEANUP_LOCK is the same as RBM_ZERO_AND_LOCK, but acquires + * a cleanup-strength lock on the page. + * * RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here. * * If strategy is not NULL, a nondefault buffer access strategy is used. @@ -362,6 +367,18 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, isExtend, found); + /* + * In RBM_ZERO_AND_LOCK mode, the caller expects the buffer to + * be already locked on return. + */ + if (!isLocalBuf) + { + if (mode == RBM_ZERO_AND_LOCK) + LWLockAcquire(bufHdr->content_lock, LW_EXCLUSIVE); + else if (mode == RBM_ZERO_AND_CLEANUP_LOCK) + LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr)); + } + return BufferDescriptorGetBuffer(bufHdr); } @@ -443,8 +460,11 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, * Read in the page, unless the caller intends to overwrite it and * just wants us to allocate a buffer. 
*/ - if (mode == RBM_ZERO) + if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK || + mode == RBM_DO_NOT_USE) + { MemSet((char *) bufBlock, 0, BLCKSZ); + } else { instr_time io_start, @@ -485,6 +505,19 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, } } + /* + * In RBM_ZERO_AND_LOCK mode, grab the buffer content lock before marking + * the page as valid, to make sure that no other backend sees the zeroed + * page before the caller has had a chance to initialize it. + * + * Since no-one else can be looking at the page contents yet, there is no + * difference between an exclusive lock and a cleanup-strength lock. + * (Note that we cannot use LockBuffer() or LockBufferForCleanup() here, + * because they assert that the buffer is already valid.) + */ + if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) + LWLockAcquire(bufHdr->content_lock, LW_EXCLUSIVE); + if (isLocalBuf) { /* Only need to adjust flags */ -- cgit v1.2.3