aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorHeikki Linnakangas <heikki.linnakangas@iki.fi>2014-11-13 19:47:44 +0200
committerHeikki Linnakangas <heikki.linnakangas@iki.fi>2014-11-13 20:01:09 +0200
commit5f1d931cf3fd3653c3f11835b4aa1dc04edb63ad (patch)
tree1ef36169ba54a211bc23af400b6109032dfc18c6
parent4ddd9e72ffaab50aa755f54e5feffc30a0850930 (diff)
downloadpostgresql-5f1d931cf3fd3653c3f11835b4aa1dc04edb63ad.tar.gz
postgresql-5f1d931cf3fd3653c3f11835b4aa1dc04edb63ad.zip
Fix race condition between hot standby and restoring a full-page image.
There was a window in RestoreBackupBlock where a page would be zeroed out, but not yet locked. If a backend pinned and locked the page in that window, it saw the zeroed page instead of the old page or new page contents, which could lead to missing rows in a result set, or errors. To fix, replace RBM_ZERO with RBM_ZERO_AND_LOCK, which atomically pins, zeroes, and locks the page, if it's not in the buffer cache already. In stable branches, the old RBM_ZERO constant is renamed to RBM_DO_NOT_USE, to avoid breaking any 3rd party extensions that might use RBM_ZERO. More importantly, this avoids renumbering the other enum values, which would cause even bigger confusion in extensions that use ReadBufferExtended, but haven't been recompiled. Backpatch to all supported versions; this has been racy since hot standby was introduced.
-rw-r--r--src/backend/access/hash/hashpage.c13
-rw-r--r--src/backend/access/heap/heapam.c3
-rw-r--r--src/backend/access/transam/xlog.c6
-rw-r--r--src/backend/access/transam/xlogutils.c17
-rw-r--r--src/backend/storage/buffer/bufmgr.c41
-rw-r--r--src/include/storage/bufmgr.h11
6 files changed, 66 insertions, 25 deletions
diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c
index 62e7755634b..0cbe813b04a 100644
--- a/src/backend/access/hash/hashpage.c
+++ b/src/backend/access/hash/hashpage.c
@@ -158,9 +158,8 @@ _hash_getinitbuf(Relation rel, BlockNumber blkno)
if (blkno == P_NEW)
elog(ERROR, "hash AM does not use P_NEW");
- buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_ZERO, NULL);
-
- LockBuffer(buf, HASH_WRITE);
+ buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_ZERO_AND_LOCK,
+ NULL);
/* ref count and lock type are correct */
@@ -201,11 +200,13 @@ _hash_getnewbuf(Relation rel, BlockNumber blkno, ForkNumber forkNum)
if (BufferGetBlockNumber(buf) != blkno)
elog(ERROR, "unexpected hash relation size: %u, should be %u",
BufferGetBlockNumber(buf), blkno);
+ LockBuffer(buf, HASH_WRITE);
}
else
- buf = ReadBufferExtended(rel, forkNum, blkno, RBM_ZERO, NULL);
-
- LockBuffer(buf, HASH_WRITE);
+ {
+ buf = ReadBufferExtended(rel, forkNum, blkno, RBM_ZERO_AND_LOCK,
+ NULL);
+ }
/* ref count and lock type are correct */
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 0105825e14a..6645732f74c 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -4367,9 +4367,8 @@ heap_xlog_newpage(XLogRecPtr lsn, XLogRecord *record)
* not do anything that assumes we are touching a heap.
*/
buffer = XLogReadBufferExtended(xlrec->node, xlrec->forknum, xlrec->blkno,
- RBM_ZERO);
+ RBM_ZERO_AND_LOCK);
Assert(BufferIsValid(buffer));
- LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
page = (Page) BufferGetPage(buffer);
Assert(record->xl_len == SizeOfHeapNewpage + BLCKSZ);
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index dafd7f67919..45028d28288 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -3620,12 +3620,8 @@ RestoreBackupBlock(XLogRecPtr lsn, XLogRecord *record, int block_index,
{
/* Found it, apply the update */
buffer = XLogReadBufferExtended(bkpb.node, bkpb.fork, bkpb.block,
- RBM_ZERO);
+ get_cleanup_lock ? RBM_ZERO_AND_CLEANUP_LOCK : RBM_ZERO_AND_LOCK);
Assert(BufferIsValid(buffer));
- if (get_cleanup_lock)
- LockBufferForCleanup(buffer);
- else
- LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
page = (Page) BufferGetPage(buffer);
diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c
index 3593a4e384d..546e3ded250 100644
--- a/src/backend/access/transam/xlogutils.c
+++ b/src/backend/access/transam/xlogutils.c
@@ -234,7 +234,8 @@ XLogCheckInvalidPages(void)
* The returned buffer is exclusively-locked.
*
* For historical reasons, instead of a ReadBufferMode argument, this only
- * supports RBM_ZERO (init == true) and RBM_NORMAL (init == false) modes.
+ * supports RBM_ZERO_AND_LOCK (init == true) and RBM_NORMAL (init == false)
+ * modes.
*/
Buffer
XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init)
@@ -242,8 +243,8 @@ XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init)
Buffer buf;
buf = XLogReadBufferExtended(rnode, MAIN_FORKNUM, blkno,
- init ? RBM_ZERO : RBM_NORMAL);
- if (BufferIsValid(buf))
+ init ? RBM_ZERO_AND_LOCK : RBM_NORMAL);
+ if (BufferIsValid(buf) && !init)
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
return buf;
@@ -262,8 +263,8 @@ XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init)
* dropped or truncated. If we don't see evidence of that later in the WAL
* sequence, we'll complain at the end of WAL replay.)
*
- * In RBM_ZERO and RBM_ZERO_ON_ERROR modes, if the page doesn't exist, the
- * relation is extended with all-zeroes pages up to the given block number.
+ * In RBM_ZERO_* modes, if the page doesn't exist, the relation is extended
+ * with all-zeroes pages up to the given block number.
*
* In RBM_NORMAL_NO_LOG mode, we return InvalidBuffer if the page doesn't
* exist, and we don't check for all-zeroes. Thus, no log entry is made
@@ -317,7 +318,11 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
do
{
if (buffer != InvalidBuffer)
+ {
+ if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
+ LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
ReleaseBuffer(buffer);
+ }
buffer = ReadBufferWithoutRelcache(rnode, forknum,
P_NEW, mode, NULL);
}
@@ -325,6 +330,8 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
/* Handle the corner case that P_NEW returns non-consecutive pages */
if (BufferGetBlockNumber(buffer) != blkno)
{
+ if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
+ LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
ReleaseBuffer(buffer);
buffer = ReadBufferWithoutRelcache(rnode, forknum, blkno,
mode, NULL);
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index cd7108b156d..b57d87d7ac6 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -208,14 +208,19 @@ ReadBuffer(Relation reln, BlockNumber blockNum)
* valid, the page is zeroed instead of throwing an error. This is intended
* for non-critical data, where the caller is prepared to repair errors.
*
- * In RBM_ZERO mode, if the page isn't in buffer cache already, it's filled
- * with zeros instead of reading it from disk. Useful when the caller is
- * going to fill the page from scratch, since this saves I/O and avoids
+ * In RBM_ZERO_AND_LOCK mode, if the page isn't in buffer cache already, it's
+ * filled with zeros instead of reading it from disk. Useful when the caller
+ * is going to fill the page from scratch, since this saves I/O and avoids
* unnecessary failure if the page-on-disk has corrupt page headers.
+ * The page is returned locked to ensure that the caller has a chance to
+ * initialize the page before it's made visible to others.
* Caution: do not use this mode to read a page that is beyond the relation's
* current physical EOF; that is likely to cause problems in md.c when
* the page is modified and written out. P_NEW is OK, though.
*
+ * RBM_ZERO_AND_CLEANUP_LOCK is the same as RBM_ZERO_AND_LOCK, but acquires
+ * a cleanup-strength lock on the page.
+ *
* RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here.
*
* If strategy is not NULL, a nondefault buffer access strategy is used.
@@ -356,6 +361,18 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
isExtend,
found);
+ /*
+ * In RBM_ZERO_AND_LOCK mode, the caller expects the buffer to
+ * be already locked on return.
+ */
+ if (!isLocalBuf)
+ {
+ if (mode == RBM_ZERO_AND_LOCK)
+ LWLockAcquire(bufHdr->content_lock, LW_EXCLUSIVE);
+ else if (mode == RBM_ZERO_AND_CLEANUP_LOCK)
+ LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr));
+ }
+
return BufferDescriptorGetBuffer(bufHdr);
}
@@ -436,8 +453,11 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
* Read in the page, unless the caller intends to overwrite it and
* just wants us to allocate a buffer.
*/
- if (mode == RBM_ZERO)
+ if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK ||
+ mode == RBM_DO_NOT_USE)
+ {
MemSet((char *) bufBlock, 0, BLCKSZ);
+ }
else
{
smgrread(smgr, forkNum, blockNum, (char *) bufBlock);
@@ -464,6 +484,19 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
}
}
+ /*
+ * In RBM_ZERO_AND_LOCK mode, grab the buffer content lock before marking
+ * the page as valid, to make sure that no other backend sees the zeroed
+ * page before the caller has had a chance to initialize it.
+ *
+ * Since no-one else can be looking at the page contents yet, there is no
+ * difference between an exclusive lock and a cleanup-strength lock.
+ * (Note that we cannot use LockBuffer() of LockBufferForCleanup() here,
+ * because they assert that the buffer is already valid.)
+ */
+ if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
+ LWLockAcquire(bufHdr->content_lock, LW_EXCLUSIVE);
+
if (isLocalBuf)
{
/* Only need to adjust flags */
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 15d9101edf6..eb85d6f208d 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -36,11 +36,16 @@ typedef enum BufferAccessStrategyType
typedef enum
{
RBM_NORMAL, /* Normal read */
- RBM_ZERO, /* Don't read from disk, caller will
- * initialize */
+ RBM_DO_NOT_USE, /* This used to be RBM_ZERO. Only kept for
+ * binary compatibility with 3rd party
+ * extensions. */
RBM_ZERO_ON_ERROR, /* Read, but return an all-zeros page on error */
- RBM_NORMAL_NO_LOG /* Don't log page as invalid during WAL
+ RBM_NORMAL_NO_LOG, /* Don't log page as invalid during WAL
* replay; otherwise same as RBM_NORMAL */
+ RBM_ZERO_AND_LOCK, /* Don't read from disk, caller will
+ * initialize. Also locks the page. */
+ RBM_ZERO_AND_CLEANUP_LOCK /* Like RBM_ZERO_AND_LOCK, but locks the page
+ * in "cleanup" mode */
} ReadBufferMode;
/* in globals.c ... this duplicates miscadmin.h */