aboutsummaryrefslogtreecommitdiff
path: root/src/backend
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend')
-rw-r--r--src/backend/access/brin/brin_xlog.c20
-rw-r--r--src/backend/access/common/Makefile4
-rw-r--r--src/backend/access/common/bufmask.c128
-rw-r--r--src/backend/access/gin/ginxlog.c32
-rw-r--r--src/backend/access/gist/gistxlog.c43
-rw-r--r--src/backend/access/heap/heapam.c79
-rw-r--r--src/backend/access/nbtree/nbtxlog.c50
-rw-r--r--src/backend/access/rmgrdesc/gindesc.c14
-rw-r--r--src/backend/access/spgist/spgxlog.c21
-rw-r--r--src/backend/access/transam/generic_xlog.c12
-rw-r--r--src/backend/access/transam/rmgr.c4
-rw-r--r--src/backend/access/transam/xlog.c120
-rw-r--r--src/backend/access/transam/xloginsert.c38
-rw-r--r--src/backend/access/transam/xlogreader.c8
-rw-r--r--src/backend/access/transam/xlogutils.c11
-rw-r--r--src/backend/commands/sequence.c12
-rw-r--r--src/backend/utils/misc/guc.c97
17 files changed, 677 insertions, 16 deletions
diff --git a/src/backend/access/brin/brin_xlog.c b/src/backend/access/brin/brin_xlog.c
index b698c9b58c5..f416bacc3f7 100644
--- a/src/backend/access/brin/brin_xlog.c
+++ b/src/backend/access/brin/brin_xlog.c
@@ -13,6 +13,7 @@
#include "access/brin_page.h"
#include "access/brin_pageops.h"
#include "access/brin_xlog.h"
+#include "access/bufmask.h"
#include "access/xlogutils.h"
@@ -279,3 +280,22 @@ brin_redo(XLogReaderState *record)
elog(PANIC, "brin_redo: unknown op code %u", info);
}
}
+
+/*
+ * Mask a BRIN page before doing consistency checks.
+ */
+void
+brin_mask(char *pagedata, BlockNumber blkno)
+{
+ Page page = (Page) pagedata;
+
+ mask_page_lsn(page);
+
+ mask_page_hint_bits(page);
+
+ if (BRIN_IS_REGULAR_PAGE(page))
+ {
+ /* Regular brin pages contain unused space which needs to be masked. */
+ mask_unused_space(page);
+ }
+}
diff --git a/src/backend/access/common/Makefile b/src/backend/access/common/Makefile
index d4b8132a973..fb27944b891 100644
--- a/src/backend/access/common/Makefile
+++ b/src/backend/access/common/Makefile
@@ -12,7 +12,7 @@ subdir = src/backend/access/common
top_builddir = ../../../..
include $(top_builddir)/src/Makefile.global
-OBJS = heaptuple.o indextuple.o printsimple.o printtup.o reloptions.o \
- scankey.o tupconvert.o tupdesc.o
+OBJS = bufmask.o heaptuple.o indextuple.o printsimple.o printtup.o \
+ reloptions.o scankey.o tupconvert.o tupdesc.o
include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/access/common/bufmask.c b/src/backend/access/common/bufmask.c
new file mode 100644
index 00000000000..3b06115e035
--- /dev/null
+++ b/src/backend/access/common/bufmask.c
@@ -0,0 +1,128 @@
+/*-------------------------------------------------------------------------
+ *
+ * bufmask.c
+ * Routines for buffer masking. Used to mask certain bits
+ * in a page which can be different when the WAL is generated
+ * and when the WAL is applied.
+ *
+ * Portions Copyright (c) 2016, PostgreSQL Global Development Group
+ *
+ * Contains common routines required for masking a page.
+ *
+ * IDENTIFICATION
+ * src/backend/access/common/bufmask.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/bufmask.h"
+
+/*
+ * mask_page_lsn
+ *
+ * In consistency checks, the LSN of the two pages compared will likely be
+ * different because of concurrent operations when the WAL is generated
+ * and the state of the page when WAL is applied.
+ */
+void
+mask_page_lsn(Page page)
+{
+ PageHeader phdr = (PageHeader) page;
+
+ PageXLogRecPtrSet(phdr->pd_lsn, (uint64) MASK_MARKER);
+}
+
+/*
+ * mask_page_hint_bits
+ *
+ * Mask hint bits in PageHeader. We want to ignore differences in hint bits,
+ * since they can be set without emitting any WAL.
+ */
+void
+mask_page_hint_bits(Page page)
+{
+ PageHeader phdr = (PageHeader) page;
+
+ /* Ignore prune_xid (it's like a hint-bit) */
+ phdr->pd_prune_xid = MASK_MARKER;
+
+ /* Ignore PD_PAGE_FULL and PD_HAS_FREE_LINES flags, they are just hints. */
+ PageClearFull(page);
+ PageClearHasFreeLinePointers(page);
+
+ /*
+ * During replay, if the page LSN has advanced past our XLOG record's LSN,
+ * we don't mark the page all-visible. See heap_xlog_visible() for
+ * details.
+ */
+ PageClearAllVisible(page);
+}
+
+/*
+ * mask_unused_space
+ *
+ * Mask the unused space of a page between pd_lower and pd_upper.
+ */
+void
+mask_unused_space(Page page)
+{
+ int pd_lower = ((PageHeader) page)->pd_lower;
+ int pd_upper = ((PageHeader) page)->pd_upper;
+ int pd_special = ((PageHeader) page)->pd_special;
+
+ /* Sanity check */
+ if (pd_lower > pd_upper || pd_special < pd_upper ||
+ pd_lower < SizeOfPageHeaderData || pd_special > BLCKSZ)
+ {
+ elog(ERROR, "invalid page pd_lower %u pd_upper %u pd_special %u\n",
+ pd_lower, pd_upper, pd_special);
+ }
+
+ memset(page + pd_lower, MASK_MARKER, pd_upper - pd_lower);
+}
+
+/*
+ * mask_lp_flags
+ *
+ * In some index AMs, line pointer flags can be modified in master without
+ * emitting any WAL record.
+ */
+void
+mask_lp_flags(Page page)
+{
+ OffsetNumber offnum,
+ maxoff;
+
+ maxoff = PageGetMaxOffsetNumber(page);
+ for (offnum = FirstOffsetNumber;
+ offnum <= maxoff;
+ offnum = OffsetNumberNext(offnum))
+ {
+ ItemId itemId = PageGetItemId(page, offnum);
+
+ if (ItemIdIsUsed(itemId))
+ itemId->lp_flags = LP_UNUSED;
+ }
+}
+
+/*
+ * mask_page_content
+ *
+ * In some index AMs, the contents of deleted pages need to be almost
+ * completely ignored.
+ */
+void
+mask_page_content(Page page)
+{
+ /* Mask Page Content */
+ memset(page + SizeOfPageHeaderData, MASK_MARKER,
+ BLCKSZ - SizeOfPageHeaderData);
+
+ /* Mask pd_lower and pd_upper */
+ memset(&((PageHeader) page)->pd_lower, MASK_MARKER,
+ sizeof(uint16));
+ memset(&((PageHeader) page)->pd_upper, MASK_MARKER,
+ sizeof(uint16));
+}
diff --git a/src/backend/access/gin/ginxlog.c b/src/backend/access/gin/ginxlog.c
index 8468fe825cf..2995e7b06a7 100644
--- a/src/backend/access/gin/ginxlog.c
+++ b/src/backend/access/gin/ginxlog.c
@@ -13,6 +13,7 @@
*/
#include "postgres.h"
+#include "access/bufmask.h"
#include "access/gin_private.h"
#include "access/xlogutils.h"
#include "utils/memutils.h"
@@ -758,3 +759,34 @@ gin_xlog_cleanup(void)
MemoryContextDelete(opCtx);
opCtx = NULL;
}
+
+/*
+ * Mask a GIN page before running consistency checks on it.
+ */
+void
+gin_mask(char *pagedata, BlockNumber blkno)
+{
+ Page page = (Page) pagedata;
+ GinPageOpaque opaque;
+
+ mask_page_lsn(page);
+ opaque = GinPageGetOpaque(page);
+
+ mask_page_hint_bits(page);
+
+ /*
+ * GIN metapage doesn't use pd_lower/pd_upper. Other page types do. Hence,
+ * we need to apply masking for those pages.
+ */
+ if (opaque->flags != GIN_META)
+ {
+ /*
+ * For GIN_DELETED page, the page is initialized to empty. Hence, mask
+ * the page content.
+ */
+ if (opaque->flags & GIN_DELETED)
+ mask_page_content(page);
+ else
+ mask_unused_space(page);
+ }
+}
diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c
index 88b97a4e487..cbda9e705cc 100644
--- a/src/backend/access/gist/gistxlog.c
+++ b/src/backend/access/gist/gistxlog.c
@@ -13,6 +13,7 @@
*/
#include "postgres.h"
+#include "access/bufmask.h"
#include "access/gist_private.h"
#include "access/xloginsert.h"
#include "access/xlogutils.h"
@@ -343,6 +344,48 @@ gist_xlog_cleanup(void)
}
/*
+ * Mask a Gist page before running consistency checks on it.
+ */
+void
+gist_mask(char *pagedata, BlockNumber blkno)
+{
+ Page page = (Page) pagedata;
+
+ mask_page_lsn(page);
+
+ mask_page_hint_bits(page);
+ mask_unused_space(page);
+
+ /*
+ * NSN is nothing but a special purpose LSN. Hence, mask it for the same
+ * reason as mask_page_lsn.
+ */
+ GistPageSetNSN(page, (uint64) MASK_MARKER);
+
+ /*
+ * We update F_FOLLOW_RIGHT flag on the left child after writing WAL
+ * record. Hence, mask this flag. See gistplacetopage() for details.
+ */
+ GistMarkFollowRight(page);
+
+ if (GistPageIsLeaf(page))
+ {
+ /*
+ * In gist leaf pages, it is possible to modify the LP_FLAGS without
+ * emitting any WAL record. Hence, mask the line pointer flags. See
+ * gistkillitems() for details.
+ */
+ mask_lp_flags(page);
+ }
+
+ /*
+ * During gist redo, we never mark a page as garbage. Hence, mask it to
+ * ignore any differences.
+ */
+ GistClearPageHasGarbage(page);
+}
+
+/*
* Write WAL record of a page split.
*/
XLogRecPtr
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 5fd7f1e1a20..0be48fb3ee1 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -38,6 +38,7 @@
*/
#include "postgres.h"
+#include "access/bufmask.h"
#include "access/heapam.h"
#include "access/heapam_xlog.h"
#include "access/hio.h"
@@ -9142,3 +9143,81 @@ heap_sync(Relation rel)
heap_close(toastrel, AccessShareLock);
}
}
+
+/*
+ * Mask a heap page before performing consistency checks on it.
+ */
+void
+heap_mask(char *pagedata, BlockNumber blkno)
+{
+ Page page = (Page) pagedata;
+ OffsetNumber off;
+
+ mask_page_lsn(page);
+
+ mask_page_hint_bits(page);
+ mask_unused_space(page);
+
+ for (off = 1; off <= PageGetMaxOffsetNumber(page); off++)
+ {
+ ItemId iid = PageGetItemId(page, off);
+ char *page_item;
+
+ page_item = (char *) (page + ItemIdGetOffset(iid));
+
+ if (ItemIdIsNormal(iid))
+ {
+
+ HeapTupleHeader page_htup = (HeapTupleHeader) page_item;
+
+ /*
+ * If xmin of a tuple is not yet frozen, we should ignore
+ * differences in hint bits, since they can be set without
+ * emitting WAL.
+ */
+ if (!HeapTupleHeaderXminFrozen(page_htup))
+ page_htup->t_infomask &= ~HEAP_XACT_MASK;
+ else
+ {
+ /* Still we need to mask xmax hint bits. */
+ page_htup->t_infomask &= ~HEAP_XMAX_INVALID;
+ page_htup->t_infomask &= ~HEAP_XMAX_COMMITTED;
+ }
+
+ /*
+ * During replay, we set Command Id to FirstCommandId. Hence, mask
+ * it. See heap_xlog_insert() for details.
+ */
+ page_htup->t_choice.t_heap.t_field3.t_cid = MASK_MARKER;
+
+ /*
+ * For a speculative tuple, heap_insert() does not set ctid in the
+ * caller-passed heap tuple itself, leaving the ctid field to
+ * contain a speculative token value - a per-backend monotonically
+ * increasing identifier. Besides, it does not WAL-log ctid under
+ * any circumstances.
+ *
+ * During redo, heap_xlog_insert() sets t_ctid to current block
+ * number and self offset number. It doesn't care about any
+ * speculative insertions in master. Hence, we set t_ctid to
+ * current block number and self offset number to ignore any
+ * inconsistency.
+ */
+ if (HeapTupleHeaderIsSpeculative(page_htup))
+ ItemPointerSet(&page_htup->t_ctid, blkno, off);
+ }
+
+ /*
+ * Ignore any padding bytes after the tuple, when the length of the
+ * item is not MAXALIGNed.
+ */
+ if (ItemIdHasStorage(iid))
+ {
+ int len = ItemIdGetLength(iid);
+ int padlen = MAXALIGN(len) - len;
+
+ if (padlen > 0)
+ memset(page_item + len, MASK_MARKER, padlen);
+ }
+ }
+}
diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c
index efad745c57e..a9ca279d813 100644
--- a/src/backend/access/nbtree/nbtxlog.c
+++ b/src/backend/access/nbtree/nbtxlog.c
@@ -14,6 +14,7 @@
*/
#include "postgres.h"
+#include "access/bufmask.h"
#include "access/heapam_xlog.h"
#include "access/nbtree.h"
#include "access/transam.h"
@@ -1028,3 +1029,52 @@ btree_redo(XLogReaderState *record)
elog(PANIC, "btree_redo: unknown op code %u", info);
}
}
+
+/*
+ * Mask a btree page before performing consistency checks on it.
+ */
+void
+btree_mask(char *pagedata, BlockNumber blkno)
+{
+ Page page = (Page) pagedata;
+ BTPageOpaque maskopaq;
+
+ mask_page_lsn(page);
+
+ mask_page_hint_bits(page);
+ mask_unused_space(page);
+
+ maskopaq = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ if (P_ISDELETED(maskopaq))
+ {
+ /*
+ * Mask page content on a DELETED page since it will be re-initialized
+ * during replay. See btree_xlog_unlink_page() for details.
+ */
+ mask_page_content(page);
+ }
+ else if (P_ISLEAF(maskopaq))
+ {
+ /*
+ * In btree leaf pages, it is possible to modify the LP_FLAGS without
+ * emitting any WAL record. Hence, mask the line pointer flags. See
+ * _bt_killitems(), _bt_check_unique() for details.
+ */
+ mask_lp_flags(page);
+ }
+
+ /*
+ * BTP_HAS_GARBAGE is just an un-logged hint bit. So, mask it. See
+ * _bt_killitems(), _bt_check_unique() for details.
+ */
+ maskopaq->btpo_flags &= ~BTP_HAS_GARBAGE;
+
+ /*
+ * During replay of a btree page split, we don't set the BTP_SPLIT_END
+ * flag of the right sibling and initialize the cycle_id to 0 for the same
+ * page. See btree_xlog_split() for details.
+ */
+ maskopaq->btpo_flags &= ~BTP_SPLIT_END;
+ maskopaq->btpo_cycleid = 0;
+}
diff --git a/src/backend/access/rmgrdesc/gindesc.c b/src/backend/access/rmgrdesc/gindesc.c
index 9e488b359af..d4ed7f9c0ab 100644
--- a/src/backend/access/rmgrdesc/gindesc.c
+++ b/src/backend/access/rmgrdesc/gindesc.c
@@ -105,7 +105,12 @@ gin_desc(StringInfo buf, XLogReaderState *record)
leftChildBlkno, rightChildBlkno);
}
if (XLogRecHasBlockImage(record, 0))
- appendStringInfoString(buf, " (full page image)");
+ {
+ if (XLogRecBlockImageApply(record, 0))
+ appendStringInfoString(buf, " (full page image)");
+ else
+ appendStringInfoString(buf, " (full page image, for WAL verification)");
+ }
else
{
char *payload = XLogRecGetBlockData(record, 0, NULL);
@@ -145,7 +150,12 @@ gin_desc(StringInfo buf, XLogReaderState *record)
case XLOG_GIN_VACUUM_DATA_LEAF_PAGE:
{
if (XLogRecHasBlockImage(record, 0))
- appendStringInfoString(buf, " (full page image)");
+ {
+ if (XLogRecBlockImageApply(record, 0))
+ appendStringInfoString(buf, " (full page image)");
+ else
+ appendStringInfoString(buf, " (full page image, for WAL verification)");
+ }
else
{
ginxlogVacuumDataLeafPage *xlrec =
diff --git a/src/backend/access/spgist/spgxlog.c b/src/backend/access/spgist/spgxlog.c
index 3dc6a5ab881..596b266ba64 100644
--- a/src/backend/access/spgist/spgxlog.c
+++ b/src/backend/access/spgist/spgxlog.c
@@ -14,6 +14,7 @@
*/
#include "postgres.h"
+#include "access/bufmask.h"
#include "access/spgist_private.h"
#include "access/transam.h"
#include "access/xlog.h"
@@ -1023,3 +1024,23 @@ spg_xlog_cleanup(void)
MemoryContextDelete(opCtx);
opCtx = NULL;
}
+
+/*
+ * Mask a SpGist page before performing consistency checks on it.
+ */
+void
+spg_mask(char *pagedata, BlockNumber blkno)
+{
+ Page page = (Page) pagedata;
+
+ mask_page_lsn(page);
+
+ mask_page_hint_bits(page);
+
+ /*
+ * Any SpGist page other than meta contains unused space which needs to be
+ * masked.
+ */
+ if (!SpGistPageIsMeta(page))
+ mask_unused_space(page);
+}
diff --git a/src/backend/access/transam/generic_xlog.c b/src/backend/access/transam/generic_xlog.c
index eddec9bc548..fbc6810c2f2 100644
--- a/src/backend/access/transam/generic_xlog.c
+++ b/src/backend/access/transam/generic_xlog.c
@@ -13,6 +13,7 @@
*/
#include "postgres.h"
+#include "access/bufmask.h"
#include "access/generic_xlog.h"
#include "access/xlogutils.h"
#include "miscadmin.h"
@@ -533,3 +534,14 @@ generic_redo(XLogReaderState *record)
UnlockReleaseBuffer(buffers[block_id]);
}
}
+
+/*
+ * Mask a generic page before performing consistency checks on it.
+ */
+void
+generic_mask(char *page, BlockNumber blkno)
+{
+ mask_page_lsn(page);
+
+ mask_unused_space(page);
+}
diff --git a/src/backend/access/transam/rmgr.c b/src/backend/access/transam/rmgr.c
index 9bb136218d5..eae75242fea 100644
--- a/src/backend/access/transam/rmgr.c
+++ b/src/backend/access/transam/rmgr.c
@@ -30,8 +30,8 @@
#include "utils/relmapper.h"
/* must be kept in sync with RmgrData definition in xlog_internal.h */
-#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup) \
- { name, redo, desc, identify, startup, cleanup },
+#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask) \
+ { name, redo, desc, identify, startup, cleanup, mask },
const RmgrData RmgrTable[RM_MAX_ID + 1] = {
#include "access/rmgrlist.h"
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 2f5d6030660..cc8b83fa8d6 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -95,6 +95,8 @@ bool EnableHotStandby = false;
bool fullPageWrites = true;
bool wal_log_hints = false;
bool wal_compression = false;
+char *wal_consistency_checking_string = NULL;
+bool *wal_consistency_checking = NULL;
bool log_checkpoints = false;
int sync_method = DEFAULT_SYNC_METHOD;
int wal_level = WAL_LEVEL_MINIMAL;
@@ -245,6 +247,10 @@ bool InArchiveRecovery = false;
/* Was the last xlog file restored from archive, or local? */
static bool restoredFromArchive = false;
+/* Buffers dedicated to consistency checks of size BLCKSZ */
+static char *replay_image_masked = NULL;
+static char *master_image_masked = NULL;
+
/* options taken from recovery.conf for archive recovery */
char *recoveryRestoreCommand = NULL;
static char *recoveryEndCommand = NULL;
@@ -903,6 +909,7 @@ static char *GetXLogBuffer(XLogRecPtr ptr);
static XLogRecPtr XLogBytePosToRecPtr(uint64 bytepos);
static XLogRecPtr XLogBytePosToEndRecPtr(uint64 bytepos);
static uint64 XLogRecPtrToBytePos(XLogRecPtr ptr);
+static void checkXLogConsistency(XLogReaderState *record);
static void WALInsertLockAcquire(void);
static void WALInsertLockAcquireExclusive(void);
@@ -1315,6 +1322,103 @@ ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr)
}
/*
+ * Checks whether the current buffer page and backup page stored in the
+ * WAL record are consistent or not. Before comparing the two pages, a
+ * masking can be applied to the pages to ignore certain areas like hint bits,
+ * unused space between pd_lower and pd_upper among other things. This
+ * function should be called once WAL replay has been completed for a
+ * given record.
+ */
+static void
+checkXLogConsistency(XLogReaderState *record)
+{
+ RmgrId rmid = XLogRecGetRmid(record);
+ RelFileNode rnode;
+ ForkNumber forknum;
+ BlockNumber blkno;
+ int block_id;
+
+ /* Records with no backup blocks have no need for consistency checks. */
+ if (!XLogRecHasAnyBlockRefs(record))
+ return;
+
+ Assert((XLogRecGetInfo(record) & XLR_CHECK_CONSISTENCY) != 0);
+
+ for (block_id = 0; block_id <= record->max_block_id; block_id++)
+ {
+ Buffer buf;
+ Page page;
+
+ if (!XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blkno))
+ {
+ /*
+ * WAL record doesn't contain a block reference with the given id.
+ * Do nothing.
+ */
+ continue;
+ }
+
+ Assert(XLogRecHasBlockImage(record, block_id));
+
+ /*
+ * Read the contents from the current buffer and store it in a
+ * temporary page.
+ */
+ buf = XLogReadBufferExtended(rnode, forknum, blkno,
+ RBM_NORMAL_NO_LOG);
+ if (!BufferIsValid(buf))
+ continue;
+
+ LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+ page = BufferGetPage(buf);
+
+ /*
+ * Take a copy of the local page where WAL has been applied to have a
+ * comparison base before masking it...
+ */
+ memcpy(replay_image_masked, page, BLCKSZ);
+
+ /* No need for this page anymore now that a copy is in. */
+ UnlockReleaseBuffer(buf);
+
+ /*
+ * If the block LSN is already ahead of this WAL record, we can't
+ * expect contents to match. This can happen if recovery is restarted.
+ */
+ if (PageGetLSN(replay_image_masked) > record->EndRecPtr)
+ continue;
+
+ /*
+ * Read the contents from the backup copy, stored in WAL record and
+ * store it in a temporary page. There is no need to allocate a new
+ * page here, a local buffer is fine to hold its contents and a mask
+ * can be directly applied on it.
+ */
+ if (!RestoreBlockImage(record, block_id, master_image_masked))
+ elog(ERROR, "failed to restore block image");
+
+ /*
+ * If masking function is defined, mask both the master and replay
+ * images
+ */
+ if (RmgrTable[rmid].rm_mask != NULL)
+ {
+ RmgrTable[rmid].rm_mask(replay_image_masked, blkno);
+ RmgrTable[rmid].rm_mask(master_image_masked, blkno);
+ }
+
+ /* Time to compare the master and replay images. */
+ if (memcmp(replay_image_masked, master_image_masked, BLCKSZ) != 0)
+ {
+ elog(FATAL,
+ "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u",
+ rnode.spcNode, rnode.dbNode, rnode.relNode,
+ forknum, blkno);
+ }
+ }
+}
+
+/*
* Subroutine of XLogInsertRecord. Copies a WAL record to an already-reserved
* area in the WAL.
*/
@@ -6200,6 +6304,13 @@ StartupXLOG(void)
errdetail("Failed while allocating an XLog reading processor.")));
xlogreader->system_identifier = ControlFile->system_identifier;
+ /*
+ * Allocate pages dedicated to WAL consistency checks, those had better
+ * be aligned.
+ */
+ replay_image_masked = (char *) palloc(BLCKSZ);
+ master_image_masked = (char *) palloc(BLCKSZ);
+
if (read_backup_label(&checkPointLoc, &backupEndRequired,
&backupFromStandby))
{
@@ -7000,6 +7111,15 @@ StartupXLOG(void)
/* Now apply the WAL record itself */
RmgrTable[record->xl_rmid].rm_redo(xlogreader);
+ /*
+ * After redo, check whether the backup pages associated with
+ * the WAL record are consistent with the existing pages. This
+ * check is done only if consistency check is enabled for this
+ * record.
+ */
+ if ((record->xl_info & XLR_CHECK_CONSISTENCY) != 0)
+ checkXLogConsistency(xlogreader);
+
/* Pop the error context stack */
error_context_stack = errcallback.previous;
diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c
index a5aa58d845d..797e68cd901 100644
--- a/src/backend/access/transam/xloginsert.c
+++ b/src/backend/access/transam/xloginsert.c
@@ -421,10 +421,12 @@ XLogInsert(RmgrId rmid, uint8 info)
elog(ERROR, "XLogBeginInsert was not called");
/*
- * The caller can set rmgr bits and XLR_SPECIAL_REL_UPDATE; the rest are
- * reserved for use by me.
+ * The caller can set rmgr bits, XLR_SPECIAL_REL_UPDATE and
+ * XLR_CHECK_CONSISTENCY; the rest are reserved for use by me.
*/
- if ((info & ~(XLR_RMGR_INFO_MASK | XLR_SPECIAL_REL_UPDATE)) != 0)
+ if ((info & ~(XLR_RMGR_INFO_MASK |
+ XLR_SPECIAL_REL_UPDATE |
+ XLR_CHECK_CONSISTENCY)) != 0)
elog(PANIC, "invalid xlog info mask %02X", info);
TRACE_POSTGRESQL_XLOG_INSERT(rmid, info);
@@ -505,6 +507,15 @@ XLogRecordAssemble(RmgrId rmid, uint8 info,
hdr_rdt.data = hdr_scratch;
/*
+ * Enforce consistency checks for this record if user is looking for
+ * it. Do this at the beginning of this routine so that callers of
+ * XLogInsert() also have the possibility to pass XLR_CHECK_CONSISTENCY
+ * directly for a record.
+ */
+ if (wal_consistency_checking[rmid])
+ info |= XLR_CHECK_CONSISTENCY;
+
+ /*
* Make an rdata chain containing all the data portions of all block
* references. This includes the data for full-page images. Also append
* the headers for the block references in the scratch buffer.
@@ -520,6 +531,7 @@ XLogRecordAssemble(RmgrId rmid, uint8 info,
XLogRecordBlockCompressHeader cbimg = {0};
bool samerel;
bool is_compressed = false;
+ bool include_image;
if (!regbuf->in_use)
continue;
@@ -563,7 +575,14 @@ XLogRecordAssemble(RmgrId rmid, uint8 info,
if ((regbuf->flags & REGBUF_WILL_INIT) == REGBUF_WILL_INIT)
bkpb.fork_flags |= BKPBLOCK_WILL_INIT;
- if (needs_backup)
+ /*
+ * If needs_backup is true or WAL checking is enabled for
+ * current resource manager, log a full-page write for the current
+ * block.
+ */
+ include_image = needs_backup || (info & XLR_CHECK_CONSISTENCY) != 0;
+
+ if (include_image)
{
Page page = regbuf->page;
uint16 compressed_len;
@@ -625,6 +644,15 @@ XLogRecordAssemble(RmgrId rmid, uint8 info,
bimg.bimg_info = (cbimg.hole_length == 0) ? 0 : BKPIMAGE_HAS_HOLE;
+ /*
+ * If WAL consistency checking is enabled for the resource manager of
+ * this WAL record, a full-page image is included in the record
+ * for the block modified. During redo, the full-page is replayed
+ * only if BKPIMAGE_APPLY is set.
+ */
+ if (needs_backup)
+ bimg.bimg_info |= BKPIMAGE_APPLY;
+
if (is_compressed)
{
bimg.length = compressed_len;
@@ -687,7 +715,7 @@ XLogRecordAssemble(RmgrId rmid, uint8 info,
/* Ok, copy the header to the scratch buffer */
memcpy(scratch, &bkpb, SizeOfXLogRecordBlockHeader);
scratch += SizeOfXLogRecordBlockHeader;
- if (needs_backup)
+ if (include_image)
{
memcpy(scratch, &bimg, SizeOfXLogRecordBlockImageHeader);
scratch += SizeOfXLogRecordBlockImageHeader;
diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c
index b528745fe85..f077662946f 100644
--- a/src/backend/access/transam/xlogreader.c
+++ b/src/backend/access/transam/xlogreader.c
@@ -997,6 +997,7 @@ ResetDecoder(XLogReaderState *state)
state->blocks[block_id].in_use = false;
state->blocks[block_id].has_image = false;
state->blocks[block_id].has_data = false;
+ state->blocks[block_id].apply_image = false;
}
state->max_block_id = -1;
}
@@ -1089,6 +1090,7 @@ DecodeXLogRecord(XLogReaderState *state, XLogRecord *record, char **errormsg)
blk = &state->blocks[block_id];
blk->in_use = true;
+ blk->apply_image = false;
COPY_HEADER_FIELD(&fork_flags, sizeof(uint8));
blk->forknum = fork_flags & BKPBLOCK_FORK_MASK;
@@ -1120,6 +1122,9 @@ DecodeXLogRecord(XLogReaderState *state, XLogRecord *record, char **errormsg)
COPY_HEADER_FIELD(&blk->bimg_len, sizeof(uint16));
COPY_HEADER_FIELD(&blk->hole_offset, sizeof(uint16));
COPY_HEADER_FIELD(&blk->bimg_info, sizeof(uint8));
+
+ blk->apply_image = ((blk->bimg_info & BKPIMAGE_APPLY) != 0);
+
if (blk->bimg_info & BKPIMAGE_IS_COMPRESSED)
{
if (blk->bimg_info & BKPIMAGE_HAS_HOLE)
@@ -1243,6 +1248,9 @@ DecodeXLogRecord(XLogReaderState *state, XLogRecord *record, char **errormsg)
if (!blk->in_use)
continue;
+
+ Assert(blk->has_image || !blk->apply_image);
+
if (blk->has_image)
{
blk->bkp_image = ptr;
diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c
index 0de2419e54b..6627f5498b9 100644
--- a/src/backend/access/transam/xlogutils.c
+++ b/src/backend/access/transam/xlogutils.c
@@ -275,9 +275,9 @@ XLogCheckInvalidPages(void)
* will complain if we don't have the lock. In hot standby mode it's
* definitely necessary.)
*
- * Note: when a backup block is available in XLOG, we restore it
- * unconditionally, even if the page in the database appears newer. This is
- * to protect ourselves against database pages that were partially or
+ * Note: when a backup block is available in XLOG with the BKPIMAGE_APPLY flag
+ * set, we restore it, even if the page in the database appears newer. This
+ * is to protect ourselves against database pages that were partially or
* incorrectly written during a crash. We assume that the XLOG data must be
* good because it has passed a CRC check, while the database page might not
* be. This will force us to replay all subsequent modifications of the page
@@ -352,9 +352,10 @@ XLogReadBufferForRedoExtended(XLogReaderState *record,
if (!willinit && zeromode)
elog(PANIC, "block to be initialized in redo routine must be marked with WILL_INIT flag in the WAL record");
- /* If it's a full-page image, restore it. */
- if (XLogRecHasBlockImage(record, block_id))
+ /* If it has a full-page image and it should be restored, do it. */
+ if (XLogRecBlockImageApply(record, block_id))
{
+ Assert(XLogRecHasBlockImage(record, block_id));
*buf = XLogReadBufferExtended(rnode, forknum, blkno,
get_cleanup_lock ? RBM_ZERO_AND_CLEANUP_LOCK : RBM_ZERO_AND_LOCK);
page = BufferGetPage(*buf);
diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c
index c148b09cd72..e6f87543df8 100644
--- a/src/backend/commands/sequence.c
+++ b/src/backend/commands/sequence.c
@@ -14,6 +14,7 @@
*/
#include "postgres.h"
+#include "access/bufmask.h"
#include "access/htup_details.h"
#include "access/multixact.h"
#include "access/transam.h"
@@ -1740,3 +1741,14 @@ ResetSequenceCaches(void)
last_used_seq = NULL;
}
+
+/*
+ * Mask a Sequence page before performing consistency checks on it.
+ */
+void
+seq_mask(char *page, BlockNumber blkno)
+{
+ mask_page_lsn(page);
+
+ mask_unused_space(page);
+}
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index c53aededcb4..de85eca6a8f 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -28,9 +28,11 @@
#include "access/commit_ts.h"
#include "access/gin.h"
+#include "access/rmgr.h"
#include "access/transam.h"
#include "access/twophase.h"
#include "access/xact.h"
+#include "access/xlog_internal.h"
#include "catalog/namespace.h"
#include "commands/async.h"
#include "commands/prepare.h"
@@ -147,6 +149,10 @@ static bool call_enum_check_hook(struct config_enum * conf, int *newval,
static bool check_log_destination(char **newval, void **extra, GucSource source);
static void assign_log_destination(const char *newval, void *extra);
+static bool check_wal_consistency_checking(char **newval, void **extra,
+ GucSource source);
+static void assign_wal_consistency_checking(const char *newval, void *extra);
+
#ifdef HAVE_SYSLOG
static int syslog_facility = LOG_LOCAL0;
#else
@@ -3572,6 +3578,17 @@ static struct config_string ConfigureNamesString[] =
check_cluster_name, NULL, NULL
},
+ {
+ {"wal_consistency_checking", PGC_SUSET, DEVELOPER_OPTIONS,
+ gettext_noop("Sets the WAL resource managers for which WAL consistency checks are done."),
+ gettext_noop("Full-page images will be logged for all data blocks and cross-checked against the results of WAL replay."),
+ GUC_LIST_INPUT | GUC_NOT_IN_SAMPLE
+ },
+ &wal_consistency_checking_string,
+ "",
+ check_wal_consistency_checking, assign_wal_consistency_checking, NULL
+ },
+
/* End-of-list marker */
{
{NULL, 0, 0, NULL, NULL}, NULL, NULL, NULL, NULL, NULL
@@ -9889,6 +9906,86 @@ call_enum_check_hook(struct config_enum * conf, int *newval, void **extra,
*/
static bool
+check_wal_consistency_checking(char **newval, void **extra, GucSource source)
+{
+ char *rawstring;
+ List *elemlist;
+ ListCell *l;
+ bool newwalconsistency[RM_MAX_ID + 1];
+
+ /* Initialize the array */
+ MemSet(newwalconsistency, 0, (RM_MAX_ID + 1) * sizeof(bool));
+
+ /* Need a modifiable copy of string */
+ rawstring = pstrdup(*newval);
+
+ /* Parse string into list of identifiers */
+ if (!SplitIdentifierString(rawstring, ',', &elemlist))
+ {
+ /* syntax error in list */
+ GUC_check_errdetail("List syntax is invalid.");
+ pfree(rawstring);
+ list_free(elemlist);
+ return false;
+ }
+
+ foreach(l, elemlist)
+ {
+ char *tok = (char *) lfirst(l);
+ bool found = false;
+ RmgrId rmid;
+
+ /* Check for 'all'. */
+ if (pg_strcasecmp(tok, "all") == 0)
+ {
+ for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
+ if (RmgrTable[rmid].rm_mask != NULL)
+ newwalconsistency[rmid] = true;
+ found = true;
+ }
+ else
+ {
+ /*
+ * Check if the token matches with any individual resource
+ * manager.
+ */
+ for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
+ {
+ if (pg_strcasecmp(tok, RmgrTable[rmid].rm_name) == 0 &&
+ RmgrTable[rmid].rm_mask != NULL)
+ {
+ newwalconsistency[rmid] = true;
+ found = true;
+ }
+ }
+ }
+
+ /* If a valid resource manager is found, check for the next one. */
+ if (!found)
+ {
+ GUC_check_errdetail("Unrecognized key word: \"%s\".", tok);
+ pfree(rawstring);
+ list_free(elemlist);
+ return false;
+ }
+ }
+
+ pfree(rawstring);
+ list_free(elemlist);
+
+ /* assign new value */
+ *extra = guc_malloc(ERROR, (RM_MAX_ID + 1) * sizeof(bool));
+ memcpy(*extra, newwalconsistency, (RM_MAX_ID + 1) * sizeof(bool));
+ return true;
+}
+
+static void
+assign_wal_consistency_checking(const char *newval, void *extra)
+{
+ wal_consistency_checking = (bool *) extra;
+}
+
+static bool
check_log_destination(char **newval, void **extra, GucSource source)
{
char *rawstring;