Allow I/O reliability checks using 16-bit checksums

Checksums are set immediately prior to flush out of shared buffers and checked when pages are read in again. Hint bit setting will require full page write when block is dirtied, which causes various infrastructure changes. Extensive comments, docs and README. WARNING message thrown if checksum fails on non-all zeroes page; ERROR thrown but can be disabled with ignore_checksum_failure = on. Feature enabled by an initdb option, since transition from option off to option on is long and complex and has not yet been implemented. Default is not to use checksums. Checksum used is WAL CRC-32 truncated to 16-bits. Simon Riggs, Jeff Davis, Greg Smith Wide input and assistance from many community members. Thank you.
author: Simon Riggs <simon@2ndQuadrant.com> 2013-03-22 13:54:07 +0000
committer: Simon Riggs <simon@2ndQuadrant.com> 2013-03-22 13:54:07 +0000
commit: 96ef3b8ff1cf1950e897fd2f766d4bd9ef0d5d56 (patch)
tree: 65849014627f4e211c6be8a4e9905b67694ed4ae /src/backend/storage/buffer/bufmgr.c
parent: e4a05c7512b23c8f48c186e685f2ef186374a20a (diff)
download: postgresql-96ef3b8ff1cf1950e897fd2f766d4bd9ef0d5d56.tar.gz
postgresql-96ef3b8ff1cf1950e897fd2f766d4bd9ef0d5d56.zip
1 files changed, 175 insertions, 39 deletions
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 0b4c2ed0a01..1cd0ac806a7 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -34,6 +34,7 @@
 #include <unistd.h>
 
 #include "catalog/catalog.h"
+#include "catalog/storage.h"
 #include "common/relpath.h"
 #include "executor/instrument.h"
 #include "miscadmin.h"
@@ -431,6 +432,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 	{
 		/* new buffers are zero-filled */
 		MemSet((char *) bufBlock, 0, BLCKSZ);
+		/* don't set checksum for all-zero page */
 		smgrextend(smgr, forkNum, blockNum, (char *) bufBlock, false);
 	}
 	else
@@ -460,13 +462,13 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 			}
 
 			/* check for garbage data */
-			if (!PageHeaderIsValid((PageHeader) bufBlock))
+			if (!PageIsVerified((Page) bufBlock, blockNum))
 			{
 				if (mode == RBM_ZERO_ON_ERROR || zero_damaged_pages)
 				{
 					ereport(WARNING,
 							(errcode(ERRCODE_DATA_CORRUPTED),
-							 errmsg("invalid page header in block %u of relation %s; zeroing out page",
+							 errmsg("invalid page in block %u of relation %s; zeroing out page",
 									blockNum,
 									relpath(smgr->smgr_rnode, forkNum))));
 					MemSet((char *) bufBlock, 0, BLCKSZ);
@@ -474,7 +476,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 				else
 					ereport(ERROR,
 							(errcode(ERRCODE_DATA_CORRUPTED),
-					 errmsg("invalid page header in block %u of relation %s",
+					 errmsg("invalid page in block %u of relation %s",
 							blockNum,
 							relpath(smgr->smgr_rnode, forkNum))));
 			}
@@ -655,14 +657,23 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 				 * victim.	We need lock to inspect the page LSN, so this
 				 * can't be done inside StrategyGetBuffer.
 				 */
-				if (strategy != NULL &&
-					XLogNeedsFlush(BufferGetLSN(buf)) &&
-					StrategyRejectBuffer(strategy, buf))
+				if (strategy != NULL)
 				{
-					/* Drop lock/pin and loop around for another buffer */
-					LWLockRelease(buf->content_lock);
-					UnpinBuffer(buf, true);
-					continue;
+					XLogRecPtr	lsn;
+
+					/* Read the LSN while holding buffer header lock */
+					LockBufHdr(buf);
+					lsn = BufferGetLSN(buf);
+					UnlockBufHdr(buf);
+
+					if (XLogNeedsFlush(lsn) &&
+						StrategyRejectBuffer(strategy, buf))
+					{
+						/* Drop lock/pin and loop around for another buffer */
+						LWLockRelease(buf->content_lock);
+						UnpinBuffer(buf, true);
+						continue;
+					}
 				}
 
 				/* OK, do the I/O */
@@ -1906,6 +1917,8 @@ FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln)
 	ErrorContextCallback errcallback;
 	instr_time	io_start,
 				io_time;
+	Block		bufBlock;
+	char		*bufToWrite;
 
 	/*
 	 * Acquire the buffer's io_in_progress lock.  If StartBufferIO returns
@@ -1931,6 +1944,18 @@ FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln)
 										reln->smgr_rnode.node.dbNode,
 										reln->smgr_rnode.node.relNode);
 
+	LockBufHdr(buf);
+
+	/*
+	 * Run PageGetLSN while holding header lock, since we don't have the
+	 * buffer locked exclusively in all cases.
+	 */
+	recptr = BufferGetLSN(buf);
+
+	/* To check if block content changes while flushing. - vadim 01/17/97 */
+	buf->flags &= ~BM_JUST_DIRTIED;
+	UnlockBufHdr(buf);
+
 	/*
 	 * Force XLOG flush up to buffer's LSN.  This implements the basic WAL
 	 * rule that log updates must hit disk before any of the data-file changes
@@ -1949,10 +1974,7 @@ FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln)
 	 * buffer isn't permanent.
 	 */
 	if (buf->flags & BM_PERMANENT)
-	{
-		recptr = BufferGetLSN(buf);
 		XLogFlush(recptr);
-	}
 
 	/*
 	 * Now it's safe to write buffer to disk. Note that no one else should
@@ -1960,18 +1982,20 @@ FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln)
 	 * we have the io_in_progress lock.
 	 */
 
-	/* To check if block content changes while flushing. - vadim 01/17/97 */
-	LockBufHdr(buf);
-	buf->flags &= ~BM_JUST_DIRTIED;
-	UnlockBufHdr(buf);
+	bufBlock = BufHdrGetBlock(buf);
+
+	bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
 
 	if (track_io_timing)
 		INSTR_TIME_SET_CURRENT(io_start);
 
+	/*
+	 * bufToWrite is either the shared buffer or a copy, as appropriate.
+	 */
 	smgrwrite(reln,
 			  buf->tag.forkNum,
 			  buf->tag.blockNum,
-			  (char *) BufHdrGetBlock(buf),
+			  bufToWrite,
 			  false);
 
 	if (track_io_timing)
@@ -2042,6 +2066,34 @@ BufferIsPermanent(Buffer buffer)
 	return (bufHdr->flags & BM_PERMANENT) != 0;
 }
 
+/*
+ * BufferGetLSNAtomic
+ *		Retrieves the LSN of the buffer atomically using a buffer header lock.
+ *		This is necessary for some callers who may not have an exclusive lock
+ *		on the buffer.
+ */
+XLogRecPtr
+BufferGetLSNAtomic(Buffer buffer)
+{
+	volatile BufferDesc *bufHdr = &BufferDescriptors[buffer - 1];
+	char				*page = BufferGetPage(buffer);
+	XLogRecPtr			 lsn;
+
+	/* Local buffers don't need a lock. */
+	if (BufferIsLocal(buffer))
+		return PageGetLSN(page);
+
+	/* Make sure we've got a real buffer, and that we hold a pin on it. */
+	Assert(BufferIsValid(buffer));
+	Assert(BufferIsPinned(buffer));
+
+	LockBufHdr(bufHdr);
+	lsn = PageGetLSN(page);
+	UnlockBufHdr(bufHdr);
+
+	return lsn;
+}
+
 /* ---------------------------------------------------------------------
  *		DropRelFileNodeBuffers
  *
@@ -2343,7 +2395,10 @@ FlushRelationBuffers(Relation rel)
 			if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
 				(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
 			{
-				ErrorContextCallback errcallback;
+				ErrorContextCallback	errcallback;
+				Page					localpage;
+
+				localpage = (char *) LocalBufHdrGetBlock(bufHdr);
 
 				/* Setup error traceback support for ereport() */
 				errcallback.callback = local_buffer_write_error_callback;
@@ -2351,10 +2406,12 @@ FlushRelationBuffers(Relation rel)
 				errcallback.previous = error_context_stack;
 				error_context_stack = &errcallback;
 
+				PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
+
 				smgrwrite(rel->rd_smgr,
 						  bufHdr->tag.forkNum,
 						  bufHdr->tag.blockNum,
-						  (char *) LocalBufHdrGetBlock(bufHdr),
+						  localpage,
 						  false);
 
 				bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED);
@@ -2509,22 +2566,24 @@ IncrBufferRefCount(Buffer buffer)
 }
 
 /*
- * SetBufferCommitInfoNeedsSave
+ * MarkBufferDirtyHint
  *
- *	Mark a buffer dirty when we have updated tuple commit-status bits in it.
+ *	Mark a buffer dirty for non-critical changes.
  *
- * This is essentially the same as MarkBufferDirty, except that the caller
- * might have only share-lock instead of exclusive-lock on the buffer's
- * content lock.  We preserve the distinction mainly as a way of documenting
- * that the caller has not made a critical data change --- the status-bit
- * update could be redone by someone else just as easily.  Therefore, no WAL
- * log record need be generated, whereas calls to MarkBufferDirty really ought
- * to be associated with a WAL-entry-creating action.
+ * This is essentially the same as MarkBufferDirty, except:
+ *
+ * 1. The caller does not write WAL; so if checksums are enabled, we may need
+ *    to write an XLOG_HINT WAL record to protect against torn pages.
+ * 2. The caller might have only share-lock instead of exclusive-lock on the
+ *    buffer's content lock.
+ * 3. This function does not guarantee that the buffer is always marked dirty
+ *    (due to a race condition), so it cannot be used for important changes.
  */
 void
-SetBufferCommitInfoNeedsSave(Buffer buffer)
+MarkBufferDirtyHint(Buffer buffer)
 {
 	volatile BufferDesc *bufHdr;
+	Page	page = BufferGetPage(buffer);
 
 	if (!BufferIsValid(buffer))
 		elog(ERROR, "bad buffer ID: %d", buffer);
@@ -2544,28 +2603,105 @@ SetBufferCommitInfoNeedsSave(Buffer buffer)
 	/*
 	 * This routine might get called many times on the same page, if we are
 	 * making the first scan after commit of an xact that added/deleted many
-	 * tuples.	So, be as quick as we can if the buffer is already dirty.  We
-	 * do this by not acquiring spinlock if it looks like the status bits are
-	 * already.  Since we make this test unlocked, there's a chance we might
-	 * fail to notice that the flags have just been cleared, and failed to
-	 * reset them, due to memory-ordering issues.  But since this function is
-	 * only intended to be used in cases where failing to write out the data
+	 * tuples. So, be as quick as we can if the buffer is already dirty.  We do
+	 * this by not acquiring spinlock if it looks like the status bits are
+	 * already set.  Since we make this test unlocked, there's a chance we
+	 * might fail to notice that the flags have just been cleared, and failed
+	 * to reset them, due to memory-ordering issues.  But since this function
+	 * is only intended to be used in cases where failing to write out the data
 	 * would be harmless anyway, it doesn't really matter.
 	 */
 	if ((bufHdr->flags & (BM_DIRTY | BM_JUST_DIRTIED)) !=
 		(BM_DIRTY | BM_JUST_DIRTIED))
 	{
+		XLogRecPtr	lsn = InvalidXLogRecPtr;
+		bool		dirtied = false;
+		bool		delayChkpt = false;
+
+		/*
+		 * If checksums are enabled, and the buffer is permanent, then a full
+		 * page image may be required even for some hint bit updates to protect
+		 * against torn pages. This full page image is only necessary if the
+		 * hint bit update is the first change to the page since the last
+		 * checkpoint.
+		 *
+		 * We don't check full_page_writes here because that logic is
+		 * included when we call XLogInsert() since the value changes
+		 * dynamically.
+		 */
+		if (DataChecksumsEnabled() && (bufHdr->flags & BM_PERMANENT))
+		{
+			/*
+			 * If we're in recovery we cannot dirty a page because of a hint.
+			 * We can set the hint, just not dirty the page as a result so
+			 * the hint is lost when we evict the page or shutdown.
+			 *
+			 * See src/backend/storage/page/README for longer discussion.
+			 */
+			if (RecoveryInProgress())
+				return;
+
+			/*
+			 * If the block is already dirty because we either made a change
+			 * or set a hint already, then we don't need to write a full page
+			 * image.  Note that aggressive cleaning of blocks
+			 * dirtied by hint bit setting would increase the call rate.
+			 * Bulk setting of hint bits would reduce the call rate...
+			 *
+			 * We must issue the WAL record before we mark the buffer dirty.
+			 * Otherwise we might write the page before we write the WAL.
+			 * That causes a race condition, since a checkpoint might occur
+			 * between writing the WAL record and marking the buffer dirty.
+			 * We solve that with a kluge, but one that is already in use
+			 * during transaction commit to prevent race conditions.
+			 * Basically, we simply prevent the checkpoint WAL record from
+			 * being written until we have marked the buffer dirty. We don't
+			 * start the checkpoint flush until we have marked dirty, so our
+			 * checkpoint must flush the change to disk successfully or the
+			 * checkpoint never gets written, so crash recovery will fix.
+			 *
+			 * It's possible we may enter here without an xid, so it is
+			 * essential that CreateCheckpoint waits for virtual transactions
+			 * rather than full transactionids.
+			 */
+			MyPgXact->delayChkpt = delayChkpt = true;
+			lsn = XLogSaveBufferForHint(buffer);
+		}
+
 		LockBufHdr(bufHdr);
 		Assert(bufHdr->refcount > 0);
 		if (!(bufHdr->flags & BM_DIRTY))
 		{
-			/* Do vacuum cost accounting */
+			dirtied = true;		/* Means "will be dirtied by this action" */
+
+			/*
+			 * Set the page LSN if we wrote a backup block. We aren't
+			 * supposed to set this when only holding a share lock but
+			 * as long as we serialise it somehow we're OK. We choose to
+			 * set LSN while holding the buffer header lock, which causes
+			 * any reader of an LSN who holds only a share lock to also
+			 * obtain a buffer header lock before using PageGetLSN().
+			 * Fortunately, thats not too many places.
+			 *
+			 * If checksums are enabled, you might think we should reset the
+			 * checksum here. That will happen when the page is written
+			 * sometime later in this checkpoint cycle.
+			 */
+			if (!XLogRecPtrIsInvalid(lsn))
+				PageSetLSN(page, lsn);
+		}
+		bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
+		UnlockBufHdr(bufHdr);
+
+		if (delayChkpt)
+			MyPgXact->delayChkpt = false;
+
+		if (dirtied)
+		{
 			VacuumPageDirty++;
 			if (VacuumCostActive)
 				VacuumCostBalance += VacuumCostPageDirty;
 		}
-		bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
-		UnlockBufHdr(bufHdr);
 	}
 }
author	Simon Riggs <simon@2ndQuadrant.com>	2013-03-22 13:54:07 +0000
committer	Simon Riggs <simon@2ndQuadrant.com>	2013-03-22 13:54:07 +0000
commit	96ef3b8ff1cf1950e897fd2f766d4bd9ef0d5d56 (patch)
tree	65849014627f4e211c6be8a4e9905b67694ed4ae /src/backend/storage/buffer/bufmgr.c
parent	e4a05c7512b23c8f48c186e685f2ef186374a20a (diff)
download	postgresql-96ef3b8ff1cf1950e897fd2f766d4bd9ef0d5d56.tar.gz postgresql-96ef3b8ff1cf1950e897fd2f766d4bd9ef0d5d56.zip