Diffstat (limited to 'src/backend/access/nbtree/nbtxlog.c')
-rw-r--r-- | src/backend/access/nbtree/nbtxlog.c | 194
1 file changed, 110 insertions, 84 deletions
diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c
index 055abf90e9e..5fbb481a4fe 100644
--- a/src/backend/access/nbtree/nbtxlog.c
+++ b/src/backend/access/nbtree/nbtxlog.c
@@ -220,10 +220,9 @@ btree_xlog_insert(bool isleaf, bool ismeta,
         datalen -= sizeof(xl_btree_metadata);
     }
 
-    if ((record->xl_info & XLR_BKP_BLOCK_1) && !ismeta && isleaf)
-        return;                 /* nothing to do */
-
-    if (!(record->xl_info & XLR_BKP_BLOCK_1))
+    if (record->xl_info & XLR_BKP_BLOCK(0))
+        (void) RestoreBackupBlock(lsn, record, 0, false, false);
+    else
     {
         buffer = XLogReadBuffer(xlrec->target.node,
                              ItemPointerGetBlockNumber(&(xlrec->target.tid)),
@@ -251,6 +250,13 @@ btree_xlog_insert(bool isleaf, bool ismeta,
         }
     }
 
+    /*
+     * Note: in normal operation, we'd update the metapage while still holding
+     * lock on the page we inserted into.  But during replay it's not
+     * necessary to hold that lock, since no other index updates can be
+     * happening concurrently, and readers will cope fine with following an
+     * obsolete link from the metapage.
+     */
     if (ismeta)
         _bt_restore_meta(xlrec->target.node, lsn,
                          md.root, md.level,
@@ -292,7 +298,7 @@ btree_xlog_split(bool onleft, bool isroot,
         forget_matching_split(xlrec->node, downlink, false);
 
     /* Extract left hikey and its size (still assuming 16-bit alignment) */
-    if (!(record->xl_info & XLR_BKP_BLOCK_1))
+    if (!(record->xl_info & XLR_BKP_BLOCK(0)))
     {
         /* We assume 16-bit alignment is enough for IndexTupleSize */
         left_hikey = (Item) datapos;
@@ -312,7 +318,7 @@ btree_xlog_split(bool onleft, bool isroot,
         datalen -= sizeof(OffsetNumber);
     }
 
-    if (onleft && !(record->xl_info & XLR_BKP_BLOCK_1))
+    if (onleft && !(record->xl_info & XLR_BKP_BLOCK(0)))
     {
         /*
          * We assume that 16-bit alignment is enough to apply IndexTupleSize
@@ -325,7 +331,7 @@ btree_xlog_split(bool onleft, bool isroot,
         datalen -= newitemsz;
     }
 
-    /* Reconstruct right (new) sibling from scratch */
+    /* Reconstruct right (new) sibling page from scratch */
     rbuf = XLogReadBuffer(xlrec->node, xlrec->rightsib, true);
     Assert(BufferIsValid(rbuf));
     rpage = (Page) BufferGetPage(rbuf);
@@ -359,18 +365,21 @@ btree_xlog_split(bool onleft, bool isroot,
 
     /* don't release the buffer yet; we touch right page's first item below */
 
-    /*
-     * Reconstruct left (original) sibling if needed.  Note that this code
-     * ensures that the items remaining on the left page are in the correct
-     * item number order, but it does not reproduce the physical order they
-     * would have had.  Is this worth changing?  See also _bt_restore_page().
-     */
-    if (!(record->xl_info & XLR_BKP_BLOCK_1))
+    /* Now reconstruct left (original) sibling page */
+    if (record->xl_info & XLR_BKP_BLOCK(0))
+        (void) RestoreBackupBlock(lsn, record, 0, false, false);
+    else
     {
         Buffer      lbuf = XLogReadBuffer(xlrec->node, xlrec->leftsib, false);
 
         if (BufferIsValid(lbuf))
         {
+            /*
+             * Note that this code ensures that the items remaining on the
+             * left page are in the correct item number order, but it does not
+             * reproduce the physical order they would have had.  Is this
+             * worth changing?  See also _bt_restore_page().
+             */
             Page        lpage = (Page) BufferGetPage(lbuf);
             BTPageOpaque lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage);
 
@@ -434,8 +443,17 @@ btree_xlog_split(bool onleft, bool isroot,
     /* We no longer need the right buffer */
     UnlockReleaseBuffer(rbuf);
 
-    /* Fix left-link of the page to the right of the new right sibling */
-    if (xlrec->rnext != P_NONE && !(record->xl_info & XLR_BKP_BLOCK_2))
+    /*
+     * Fix left-link of the page to the right of the new right sibling.
+     *
+     * Note: in normal operation, we do this while still holding lock on the
+     * two split pages.  However, that's not necessary for correctness in WAL
+     * replay, because no other index update can be in progress, and readers
+     * will cope properly when following an obsolete left-link.
+     */
+    if (record->xl_info & XLR_BKP_BLOCK(1))
+        (void) RestoreBackupBlock(lsn, record, 1, false, false);
+    else if (xlrec->rnext != P_NONE)
     {
         Buffer      buffer = XLogReadBuffer(xlrec->node, xlrec->rnext, false);
 
@@ -465,13 +483,11 @@
 static void
 btree_xlog_vacuum(XLogRecPtr lsn, XLogRecord *record)
 {
-    xl_btree_vacuum *xlrec;
+    xl_btree_vacuum *xlrec = (xl_btree_vacuum *) XLogRecGetData(record);
     Buffer      buffer;
     Page        page;
     BTPageOpaque opaque;
 
-    xlrec = (xl_btree_vacuum *) XLogRecGetData(record);
-
     /*
      * If queries might be active then we need to ensure every block is
      * unpinned between the lastBlockVacuumed and the current block, if there
@@ -504,13 +520,14 @@ btree_xlog_vacuum(XLogRecPtr lsn, XLogRecord *record)
     }
 
     /*
-     * If the block was restored from a full page image, nothing more to do.
-     * The RestoreBkpBlocks() call already pinned and took cleanup lock on it.
-     * XXX: Perhaps we should call RestoreBkpBlocks() *after* the loop above,
-     * to make the disk access more sequential.
+     * If we have a full-page image, restore it (using a cleanup lock) and
+     * we're done.
      */
-    if (record->xl_info & XLR_BKP_BLOCK_1)
+    if (record->xl_info & XLR_BKP_BLOCK(0))
+    {
+        (void) RestoreBackupBlock(lsn, record, 0, true, false);
         return;
+    }
 
     /*
      * Like in btvacuumpage(), we need to take a cleanup lock on every leaf
@@ -565,9 +582,8 @@
  * XXX optimise later with something like XLogPrefetchBuffer()
  */
 static TransactionId
-btree_xlog_delete_get_latestRemovedXid(XLogRecord *record)
+btree_xlog_delete_get_latestRemovedXid(xl_btree_delete *xlrec)
 {
-    xl_btree_delete *xlrec = (xl_btree_delete *) XLogRecGetData(record);
     OffsetNumber *unused;
     Buffer      ibuffer,
                 hbuffer;
@@ -687,15 +703,35 @@
 static void
 btree_xlog_delete(XLogRecPtr lsn, XLogRecord *record)
 {
-    xl_btree_delete *xlrec;
+    xl_btree_delete *xlrec = (xl_btree_delete *) XLogRecGetData(record);
     Buffer      buffer;
     Page        page;
     BTPageOpaque opaque;
 
-    if (record->xl_info & XLR_BKP_BLOCK_1)
-        return;
+    /*
+     * If we have any conflict processing to do, it must happen before we
+     * update the page.
+     *
+     * Btree delete records can conflict with standby queries.  You might
+     * think that vacuum records would conflict as well, but we've handled
+     * that already.  XLOG_HEAP2_CLEANUP_INFO records provide the highest xid
+     * cleaned by the vacuum of the heap and so we can resolve any conflicts
+     * just once when that arrives.  After that we know that no conflicts
+     * exist from individual btree vacuum records on that index.
+     */
+    if (InHotStandby)
+    {
+        TransactionId latestRemovedXid = btree_xlog_delete_get_latestRemovedXid(xlrec);
 
-    xlrec = (xl_btree_delete *) XLogRecGetData(record);
+        ResolveRecoveryConflictWithSnapshot(latestRemovedXid, xlrec->node);
+    }
+
+    /* If we have a full-page image, restore it and we're done */
+    if (record->xl_info & XLR_BKP_BLOCK(0))
+    {
+        (void) RestoreBackupBlock(lsn, record, 0, false, false);
+        return;
+    }
 
     /*
      * We don't need to take a cleanup lock to apply these changes. See
@@ -751,8 +787,18 @@ btree_xlog_delete_page(uint8 info, XLogRecPtr lsn, XLogRecord *record)
     leftsib = xlrec->leftblk;
     rightsib = xlrec->rightblk;
 
+    /*
+     * In normal operation, we would lock all the pages this WAL record
+     * touches before changing any of them.  In WAL replay, it should be okay
+     * to lock just one page at a time, since no concurrent index updates can
+     * be happening, and readers should not care whether they arrive at the
+     * target page or not (since it's surely empty).
+     */
+
     /* parent page */
-    if (!(record->xl_info & XLR_BKP_BLOCK_1))
+    if (record->xl_info & XLR_BKP_BLOCK(0))
+        (void) RestoreBackupBlock(lsn, record, 0, false, false);
+    else
     {
         buffer = XLogReadBuffer(xlrec->target.node, parent, false);
         if (BufferIsValid(buffer))
@@ -798,7 +844,9 @@ btree_xlog_delete_page(uint8 info, XLogRecPtr lsn, XLogRecord *record)
     }
 
     /* Fix left-link of right sibling */
-    if (!(record->xl_info & XLR_BKP_BLOCK_2))
+    if (record->xl_info & XLR_BKP_BLOCK(1))
+        (void) RestoreBackupBlock(lsn, record, 1, false, false);
+    else
     {
         buffer = XLogReadBuffer(xlrec->target.node, rightsib, false);
         if (BufferIsValid(buffer))
@@ -822,7 +870,9 @@ btree_xlog_delete_page(uint8 info, XLogRecPtr lsn, XLogRecord *record)
     }
 
     /* Fix right-link of left sibling, if any */
-    if (!(record->xl_info & XLR_BKP_BLOCK_3))
+    if (record->xl_info & XLR_BKP_BLOCK(2))
+        (void) RestoreBackupBlock(lsn, record, 2, false, false);
+    else
     {
         if (leftsib != P_NONE)
         {
@@ -896,6 +946,9 @@ btree_xlog_newroot(XLogRecPtr lsn, XLogRecord *record)
     BTPageOpaque pageop;
     BlockNumber downlink = 0;
 
+    /* Backup blocks are not used in newroot records */
+    Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
+
     buffer = XLogReadBuffer(xlrec->node, xlrec->rootblk, true);
     Assert(BufferIsValid(buffer));
     page = (Page) BufferGetPage(buffer);
@@ -937,63 +990,36 @@
         forget_matching_split(xlrec->node, downlink, true);
 }
 
-
-void
-btree_redo(XLogRecPtr lsn, XLogRecord *record)
+static void
+btree_xlog_reuse_page(XLogRecPtr lsn, XLogRecord *record)
 {
-    uint8       info = record->xl_info & ~XLR_INFO_MASK;
+    xl_btree_reuse_page *xlrec = (xl_btree_reuse_page *) XLogRecGetData(record);
 
+    /*
+     * Btree reuse_page records exist to provide a conflict point when we
+     * reuse pages in the index via the FSM.  That's all they do though.
+     *
+     * latestRemovedXid was the page's btpo.xact.  The btpo.xact <
+     * RecentGlobalXmin test in _bt_page_recyclable() conceptually mirrors the
+     * pgxact->xmin > limitXmin test in GetConflictingVirtualXIDs().
+     * Consequently, one XID value achieves the same exclusion effect on
+     * master and standby.
+     */
     if (InHotStandby)
     {
-        switch (info)
-        {
-            case XLOG_BTREE_DELETE:
-
-                /*
-                 * Btree delete records can conflict with standby queries. You
-                 * might think that vacuum records would conflict as well, but
-                 * we've handled that already.  XLOG_HEAP2_CLEANUP_INFO records
-                 * provide the highest xid cleaned by the vacuum of the heap
-                 * and so we can resolve any conflicts just once when that
-                 * arrives.  After that any we know that no conflicts exist
-                 * from individual btree vacuum records on that index.
-                 */
-                {
-                    TransactionId latestRemovedXid = btree_xlog_delete_get_latestRemovedXid(record);
-                    xl_btree_delete *xlrec = (xl_btree_delete *) XLogRecGetData(record);
-
-                    ResolveRecoveryConflictWithSnapshot(latestRemovedXid, xlrec->node);
-                }
-                break;
-
-            case XLOG_BTREE_REUSE_PAGE:
-
-                /*
-                 * Btree reuse page records exist to provide a conflict point
-                 * when we reuse pages in the index via the FSM. That's all it
-                 * does though. latestRemovedXid was the page's btpo.xact. The
-                 * btpo.xact < RecentGlobalXmin test in _bt_page_recyclable()
-                 * conceptually mirrors the pgxact->xmin > limitXmin test in
-                 * GetConflictingVirtualXIDs(). Consequently, one XID value
-                 * achieves the same exclusion effect on master and standby.
-                 */
-                {
-                    xl_btree_reuse_page *xlrec = (xl_btree_reuse_page *) XLogRecGetData(record);
+        ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid,
+                                            xlrec->node);
+    }
 
-                    ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid, xlrec->node);
-                }
-                return;
+    /* Backup blocks are not used in reuse_page records */
+    Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
+}
 
-            default:
-                break;
-        }
-    }
 
-    /*
-     * Vacuum needs to pin and take cleanup lock on every leaf page, a regular
-     * exclusive lock is enough for all other purposes.
-     */
-    RestoreBkpBlocks(lsn, record, (info == XLOG_BTREE_VACUUM));
+void
+btree_redo(XLogRecPtr lsn, XLogRecord *record)
+{
+    uint8       info = record->xl_info & ~XLR_INFO_MASK;
 
     switch (info)
     {
@@ -1033,7 +1059,7 @@ btree_redo(XLogRecPtr lsn, XLogRecord *record)
             btree_xlog_newroot(lsn, record);
             break;
         case XLOG_BTREE_REUSE_PAGE:
-            /* Handled above before restoring bkp block */
+            btree_xlog_reuse_page(lsn, record);
             break;
         default:
             elog(PANIC, "btree_redo: unknown op code %u", info);
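For orientation: every hunk above applies the same conversion inside each redo routine, replacing the single up-front RestoreBkpBlocks() call in btree_redo() with a per-block test of XLR_BKP_BLOCK(n), which either restores the logged full-page image via RestoreBackupBlock() or replays the change against the page read with XLogReadBuffer(). A compressed sketch of that pattern follows; it is illustrative only and not part of the commit, the helper name btree_xlog_example and its node/blkno parameters are hypothetical, and the usual backend headers are assumed.

static void
btree_xlog_example(XLogRecPtr lsn, XLogRecord *record,
                   RelFileNode node, BlockNumber blkno)
{
    if (record->xl_info & XLR_BKP_BLOCK(0))
    {
        /* A full-page image of block 0 was logged: restoring it is all we do */
        (void) RestoreBackupBlock(lsn, record, 0, false, false);
    }
    else
    {
        /* No image: read the existing page and replay the logged change */
        Buffer      buffer = XLogReadBuffer(node, blkno, false);

        if (BufferIsValid(buffer))
        {
            Page        page = (Page) BufferGetPage(buffer);

            /* Skip the update if the page already carries this change */
            if (lsn > PageGetLSN(page))
            {
                /* ... apply the change described by the WAL record ... */
                PageSetLSN(page, lsn);
                MarkBufferDirty(buffer);
            }
            UnlockReleaseBuffer(buffer);
        }
    }
}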