Diffstat (limited to 'src/backend/access/nbtree/nbtxlog.c')
-rw-r--r--  src/backend/access/nbtree/nbtxlog.c  194
1 file changed, 110 insertions(+), 84 deletions(-)
diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c
index 055abf90e9e..5fbb481a4fe 100644
--- a/src/backend/access/nbtree/nbtxlog.c
+++ b/src/backend/access/nbtree/nbtxlog.c
@@ -220,10 +220,9 @@ btree_xlog_insert(bool isleaf, bool ismeta,
datalen -= sizeof(xl_btree_metadata);
}
- if ((record->xl_info & XLR_BKP_BLOCK_1) && !ismeta && isleaf)
- return; /* nothing to do */
-
- if (!(record->xl_info & XLR_BKP_BLOCK_1))
+ if (record->xl_info & XLR_BKP_BLOCK(0))
+ (void) RestoreBackupBlock(lsn, record, 0, false, false);
+ else
{
buffer = XLogReadBuffer(xlrec->target.node,
ItemPointerGetBlockNumber(&(xlrec->target.tid)),
@@ -251,6 +250,13 @@ btree_xlog_insert(bool isleaf, bool ismeta,
}
}
+ /*
+ * Note: in normal operation, we'd update the metapage while still holding
+ * lock on the page we inserted into. But during replay it's not
+ * necessary to hold that lock, since no other index updates can be
+ * happening concurrently, and readers will cope fine with following an
+ * obsolete link from the metapage.
+ */
if (ismeta)
_bt_restore_meta(xlrec->target.node, lsn,
md.root, md.level,
@@ -292,7 +298,7 @@ btree_xlog_split(bool onleft, bool isroot,
forget_matching_split(xlrec->node, downlink, false);
/* Extract left hikey and its size (still assuming 16-bit alignment) */
- if (!(record->xl_info & XLR_BKP_BLOCK_1))
+ if (!(record->xl_info & XLR_BKP_BLOCK(0)))
{
/* We assume 16-bit alignment is enough for IndexTupleSize */
left_hikey = (Item) datapos;
@@ -312,7 +318,7 @@ btree_xlog_split(bool onleft, bool isroot,
datalen -= sizeof(OffsetNumber);
}
- if (onleft && !(record->xl_info & XLR_BKP_BLOCK_1))
+ if (onleft && !(record->xl_info & XLR_BKP_BLOCK(0)))
{
/*
* We assume that 16-bit alignment is enough to apply IndexTupleSize
@@ -325,7 +331,7 @@ btree_xlog_split(bool onleft, bool isroot,
datalen -= newitemsz;
}
- /* Reconstruct right (new) sibling from scratch */
+ /* Reconstruct right (new) sibling page from scratch */
rbuf = XLogReadBuffer(xlrec->node, xlrec->rightsib, true);
Assert(BufferIsValid(rbuf));
rpage = (Page) BufferGetPage(rbuf);
@@ -359,18 +365,21 @@ btree_xlog_split(bool onleft, bool isroot,
/* don't release the buffer yet; we touch right page's first item below */
- /*
- * Reconstruct left (original) sibling if needed. Note that this code
- * ensures that the items remaining on the left page are in the correct
- * item number order, but it does not reproduce the physical order they
- * would have had. Is this worth changing? See also _bt_restore_page().
- */
- if (!(record->xl_info & XLR_BKP_BLOCK_1))
+ /* Now reconstruct left (original) sibling page */
+ if (record->xl_info & XLR_BKP_BLOCK(0))
+ (void) RestoreBackupBlock(lsn, record, 0, false, false);
+ else
{
Buffer lbuf = XLogReadBuffer(xlrec->node, xlrec->leftsib, false);
if (BufferIsValid(lbuf))
{
+ /*
+ * Note that this code ensures that the items remaining on the
+ * left page are in the correct item number order, but it does not
+ * reproduce the physical order they would have had. Is this
+ * worth changing? See also _bt_restore_page().
+ */
Page lpage = (Page) BufferGetPage(lbuf);
BTPageOpaque lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage);
@@ -434,8 +443,17 @@ btree_xlog_split(bool onleft, bool isroot,
/* We no longer need the right buffer */
UnlockReleaseBuffer(rbuf);
- /* Fix left-link of the page to the right of the new right sibling */
- if (xlrec->rnext != P_NONE && !(record->xl_info & XLR_BKP_BLOCK_2))
+ /*
+ * Fix left-link of the page to the right of the new right sibling.
+ *
+ * Note: in normal operation, we do this while still holding lock on the
+ * two split pages. However, that's not necessary for correctness in WAL
+ * replay, because no other index update can be in progress, and readers
+ * will cope properly when following an obsolete left-link.
+ */
+ if (record->xl_info & XLR_BKP_BLOCK(1))
+ (void) RestoreBackupBlock(lsn, record, 1, false, false);
+ else if (xlrec->rnext != P_NONE)
{
Buffer buffer = XLogReadBuffer(xlrec->node, xlrec->rnext, false);
@@ -465,13 +483,11 @@ btree_xlog_split(bool onleft, bool isroot,
static void
btree_xlog_vacuum(XLogRecPtr lsn, XLogRecord *record)
{
- xl_btree_vacuum *xlrec;
+ xl_btree_vacuum *xlrec = (xl_btree_vacuum *) XLogRecGetData(record);
Buffer buffer;
Page page;
BTPageOpaque opaque;
- xlrec = (xl_btree_vacuum *) XLogRecGetData(record);
-
/*
* If queries might be active then we need to ensure every block is
* unpinned between the lastBlockVacuumed and the current block, if there
@@ -504,13 +520,14 @@ btree_xlog_vacuum(XLogRecPtr lsn, XLogRecord *record)
}
/*
- * If the block was restored from a full page image, nothing more to do.
- * The RestoreBkpBlocks() call already pinned and took cleanup lock on it.
- * XXX: Perhaps we should call RestoreBkpBlocks() *after* the loop above,
- * to make the disk access more sequential.
+ * If we have a full-page image, restore it (using a cleanup lock) and
+ * we're done.
*/
- if (record->xl_info & XLR_BKP_BLOCK_1)
+ if (record->xl_info & XLR_BKP_BLOCK(0))
+ {
+ (void) RestoreBackupBlock(lsn, record, 0, true, false);
return;
+ }
/*
* Like in btvacuumpage(), we need to take a cleanup lock on every leaf
@@ -565,9 +582,8 @@ btree_xlog_vacuum(XLogRecPtr lsn, XLogRecord *record)
* XXX optimise later with something like XLogPrefetchBuffer()
*/
static TransactionId
-btree_xlog_delete_get_latestRemovedXid(XLogRecord *record)
+btree_xlog_delete_get_latestRemovedXid(xl_btree_delete *xlrec)
{
- xl_btree_delete *xlrec = (xl_btree_delete *) XLogRecGetData(record);
OffsetNumber *unused;
Buffer ibuffer,
hbuffer;
@@ -687,15 +703,35 @@ btree_xlog_delete_get_latestRemovedXid(XLogRecord *record)
static void
btree_xlog_delete(XLogRecPtr lsn, XLogRecord *record)
{
- xl_btree_delete *xlrec;
+ xl_btree_delete *xlrec = (xl_btree_delete *) XLogRecGetData(record);
Buffer buffer;
Page page;
BTPageOpaque opaque;
- if (record->xl_info & XLR_BKP_BLOCK_1)
- return;
+ /*
+ * If we have any conflict processing to do, it must happen before we
+ * update the page.
+ *
+ * Btree delete records can conflict with standby queries. You might
+ * think that vacuum records would conflict as well, but we've handled
+ * that already. XLOG_HEAP2_CLEANUP_INFO records provide the highest xid
+ * cleaned by the vacuum of the heap and so we can resolve any conflicts
+ * just once when that arrives. After that we know that no conflicts
+ * exist from individual btree vacuum records on that index.
+ */
+ if (InHotStandby)
+ {
+ TransactionId latestRemovedXid = btree_xlog_delete_get_latestRemovedXid(xlrec);
- xlrec = (xl_btree_delete *) XLogRecGetData(record);
+ ResolveRecoveryConflictWithSnapshot(latestRemovedXid, xlrec->node);
+ }
+
+ /* If we have a full-page image, restore it and we're done */
+ if (record->xl_info & XLR_BKP_BLOCK(0))
+ {
+ (void) RestoreBackupBlock(lsn, record, 0, false, false);
+ return;
+ }
/*
* We don't need to take a cleanup lock to apply these changes. See
@@ -751,8 +787,18 @@ btree_xlog_delete_page(uint8 info, XLogRecPtr lsn, XLogRecord *record)
leftsib = xlrec->leftblk;
rightsib = xlrec->rightblk;
+ /*
+ * In normal operation, we would lock all the pages this WAL record
+ * touches before changing any of them. In WAL replay, it should be okay
+ * to lock just one page at a time, since no concurrent index updates can
+ * be happening, and readers should not care whether they arrive at the
+ * target page or not (since it's surely empty).
+ */
+
/* parent page */
- if (!(record->xl_info & XLR_BKP_BLOCK_1))
+ if (record->xl_info & XLR_BKP_BLOCK(0))
+ (void) RestoreBackupBlock(lsn, record, 0, false, false);
+ else
{
buffer = XLogReadBuffer(xlrec->target.node, parent, false);
if (BufferIsValid(buffer))
@@ -798,7 +844,9 @@ btree_xlog_delete_page(uint8 info, XLogRecPtr lsn, XLogRecord *record)
}
/* Fix left-link of right sibling */
- if (!(record->xl_info & XLR_BKP_BLOCK_2))
+ if (record->xl_info & XLR_BKP_BLOCK(1))
+ (void) RestoreBackupBlock(lsn, record, 1, false, false);
+ else
{
buffer = XLogReadBuffer(xlrec->target.node, rightsib, false);
if (BufferIsValid(buffer))
@@ -822,7 +870,9 @@ btree_xlog_delete_page(uint8 info, XLogRecPtr lsn, XLogRecord *record)
}
/* Fix right-link of left sibling, if any */
- if (!(record->xl_info & XLR_BKP_BLOCK_3))
+ if (record->xl_info & XLR_BKP_BLOCK(2))
+ (void) RestoreBackupBlock(lsn, record, 2, false, false);
+ else
{
if (leftsib != P_NONE)
{
@@ -896,6 +946,9 @@ btree_xlog_newroot(XLogRecPtr lsn, XLogRecord *record)
BTPageOpaque pageop;
BlockNumber downlink = 0;
+ /* Backup blocks are not used in newroot records */
+ Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
+
buffer = XLogReadBuffer(xlrec->node, xlrec->rootblk, true);
Assert(BufferIsValid(buffer));
page = (Page) BufferGetPage(buffer);
@@ -937,63 +990,36 @@ btree_xlog_newroot(XLogRecPtr lsn, XLogRecord *record)
forget_matching_split(xlrec->node, downlink, true);
}
-
-void
-btree_redo(XLogRecPtr lsn, XLogRecord *record)
+static void
+btree_xlog_reuse_page(XLogRecPtr lsn, XLogRecord *record)
{
- uint8 info = record->xl_info & ~XLR_INFO_MASK;
+ xl_btree_reuse_page *xlrec = (xl_btree_reuse_page *) XLogRecGetData(record);
+ /*
+ * Btree reuse_page records exist to provide a conflict point when we
+ * reuse pages in the index via the FSM. That's all they do though.
+ *
+ * latestRemovedXid was the page's btpo.xact. The btpo.xact <
+ * RecentGlobalXmin test in _bt_page_recyclable() conceptually mirrors the
+ * pgxact->xmin > limitXmin test in GetConflictingVirtualXIDs().
+ * Consequently, one XID value achieves the same exclusion effect on
+ * master and standby.
+ */
if (InHotStandby)
{
- switch (info)
- {
- case XLOG_BTREE_DELETE:
-
- /*
- * Btree delete records can conflict with standby queries. You
- * might think that vacuum records would conflict as well, but
- * we've handled that already. XLOG_HEAP2_CLEANUP_INFO records
- * provide the highest xid cleaned by the vacuum of the heap
- * and so we can resolve any conflicts just once when that
- * arrives. After that any we know that no conflicts exist
- * from individual btree vacuum records on that index.
- */
- {
- TransactionId latestRemovedXid = btree_xlog_delete_get_latestRemovedXid(record);
- xl_btree_delete *xlrec = (xl_btree_delete *) XLogRecGetData(record);
-
- ResolveRecoveryConflictWithSnapshot(latestRemovedXid, xlrec->node);
- }
- break;
-
- case XLOG_BTREE_REUSE_PAGE:
-
- /*
- * Btree reuse page records exist to provide a conflict point
- * when we reuse pages in the index via the FSM. That's all it
- * does though. latestRemovedXid was the page's btpo.xact. The
- * btpo.xact < RecentGlobalXmin test in _bt_page_recyclable()
- * conceptually mirrors the pgxact->xmin > limitXmin test in
- * GetConflictingVirtualXIDs(). Consequently, one XID value
- * achieves the same exclusion effect on master and standby.
- */
- {
- xl_btree_reuse_page *xlrec = (xl_btree_reuse_page *) XLogRecGetData(record);
+ ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid,
+ xlrec->node);
+ }
- ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid, xlrec->node);
- }
- return;
+ /* Backup blocks are not used in reuse_page records */
+ Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
+}
- default:
- break;
- }
- }
- /*
- * Vacuum needs to pin and take cleanup lock on every leaf page, a regular
- * exclusive lock is enough for all other purposes.
- */
- RestoreBkpBlocks(lsn, record, (info == XLOG_BTREE_VACUUM));
+void
+btree_redo(XLogRecPtr lsn, XLogRecord *record)
+{
+ uint8 info = record->xl_info & ~XLR_INFO_MASK;
switch (info)
{
@@ -1033,7 +1059,7 @@ btree_redo(XLogRecPtr lsn, XLogRecord *record)
btree_xlog_newroot(lsn, record);
break;
case XLOG_BTREE_REUSE_PAGE:
- /* Handled above before restoring bkp block */
+ btree_xlog_reuse_page(lsn, record);
break;
default:
elog(PANIC, "btree_redo: unknown op code %u", info);
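
For reference, the recurring pattern these hunks convert the redo routines to can be sketched outside the diff. The sketch below is illustrative only and is not part of the commit: btree_xlog_foo, xl_btree_foo, and the blkno field are hypothetical names, and the body assumes the post-patch convention in which each redo routine tests XLR_BKP_BLOCK(n) and restores its own full-page images via RestoreBackupBlock(), instead of relying on a blanket RestoreBkpBlocks() call in btree_redo().

/*
 * Illustrative sketch only (not part of this commit): the shape most
 * redo routines take after this change.  "xl_btree_foo" and "blkno"
 * are hypothetical; timeline/logging details are omitted.
 */
static void
btree_xlog_foo(XLogRecPtr lsn, XLogRecord *record)
{
	xl_btree_foo *xlrec = (xl_btree_foo *) XLogRecGetData(record);
	Buffer		buffer;
	Page		page;

	/* If a full-page image was logged for block 0, just restore it. */
	if (record->xl_info & XLR_BKP_BLOCK(0))
	{
		(void) RestoreBackupBlock(lsn, record, 0, false, false);
		return;
	}

	/* Otherwise read the page and replay the change if it isn't there yet. */
	buffer = XLogReadBuffer(xlrec->node, xlrec->blkno, false);
	if (!BufferIsValid(buffer))
		return;					/* page has since been deleted; nothing to do */
	page = (Page) BufferGetPage(buffer);

	if (lsn > PageGetLSN(page))
	{
		/* ... apply the record's change to the page here ... */
		PageSetLSN(page, lsn);
		MarkBufferDirty(buffer);
	}
	UnlockReleaseBuffer(buffer);
}

Where a cleanup lock is needed, the cleanup-lock flag is passed as true instead, which is what the (void) RestoreBackupBlock(lsn, record, 0, true, false) call in the btree_xlog_vacuum hunk above does.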