Diffstat (limited to 'src/backend/access/nbtree/nbtree.c')
-rw-r--r--   src/backend/access/nbtree/nbtree.c   69
1 file changed, 41 insertions, 28 deletions
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 75a6ea81d45..b7f9e2e1608 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -56,8 +56,8 @@ typedef struct
     IndexBulkDeleteCallback callback;
     void       *callback_state;
     BTCycleId   cycleid;
-    BlockNumber lastBlockVacuumed;  /* last blkno reached by Vacuum scan */
-    BlockNumber lastUsedPage;       /* blkno of last non-recyclable page */
+    BlockNumber lastBlockVacuumed;  /* highest blkno actually vacuumed */
+    BlockNumber lastBlockLocked;    /* highest blkno we've cleanup-locked */
     BlockNumber totFreePages;       /* true total # of free pages */
     MemoryContext pagedelcontext;
 } BTVacState;
@@ -763,7 +763,7 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
     vstate.callback_state = callback_state;
     vstate.cycleid = cycleid;
     vstate.lastBlockVacuumed = BTREE_METAPAGE;  /* Initialise at first block */
-    vstate.lastUsedPage = BTREE_METAPAGE;
+    vstate.lastBlockLocked = BTREE_METAPAGE;
     vstate.totFreePages = 0;
 
     /* Create a temporary memory context to run _bt_pagedel in */
@@ -819,27 +819,30 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
     }
 
     /*
-     * InHotStandby we need to scan right up to the end of the index for
-     * correct locking, so we may need to write a WAL record for the final
-     * block in the index if it was not vacuumed. It's possible that VACUUMing
-     * has actually removed zeroed pages at the end of the index so we need to
-     * take care to issue the record for last actual block and not for the
-     * last block that was scanned. Ignore empty indexes.
+     * If the WAL is replayed in hot standby, the replay process needs to get
+     * cleanup locks on all index leaf pages, just as we've been doing here.
+     * However, we won't issue any WAL records about pages that have no items
+     * to be deleted.  For pages between pages we've vacuumed, the replay code
+     * will take locks under the direction of the lastBlockVacuumed fields in
+     * the XLOG_BTREE_VACUUM WAL records.  To cover pages after the last one
+     * we vacuum, we need to issue a dummy XLOG_BTREE_VACUUM WAL record
+     * against the last leaf page in the index, if that one wasn't vacuumed.
      */
     if (XLogStandbyInfoActive() &&
-        num_pages > 1 && vstate.lastBlockVacuumed < (num_pages - 1))
+        vstate.lastBlockVacuumed < vstate.lastBlockLocked)
     {
         Buffer      buf;
 
         /*
-         * We can't use _bt_getbuf() here because it always applies
-         * _bt_checkpage(), which will barf on an all-zero page. We want to
-         * recycle all-zero pages, not fail. Also, we want to use a
-         * nondefault buffer access strategy.
+         * The page should be valid, but we can't use _bt_getbuf() because we
+         * want to use a nondefault buffer access strategy.  Since we aren't
+         * going to delete any items, getting cleanup lock again is probably
+         * overkill, but for consistency do that anyway.
          */
-        buf = ReadBufferExtended(rel, MAIN_FORKNUM, num_pages - 1, RBM_NORMAL,
-                                 info->strategy);
+        buf = ReadBufferExtended(rel, MAIN_FORKNUM, vstate.lastBlockLocked,
+                                 RBM_NORMAL, info->strategy);
         LockBufferForCleanup(buf);
+        _bt_checkpage(rel, buf);
         _bt_delitems_vacuum(rel, buf, NULL, 0, vstate.lastBlockVacuumed);
         _bt_relbuf(rel, buf);
     }
@@ -914,10 +917,6 @@ restart:
         }
     }
 
-    /* If the page is in use, update lastUsedPage */
-    if (!_bt_page_recyclable(page) && vstate->lastUsedPage < blkno)
-        vstate->lastUsedPage = blkno;
-
     /* Page is valid, see what to do with it */
     if (_bt_page_recyclable(page))
     {
@@ -954,6 +953,13 @@ restart:
         LockBufferForCleanup(buf);
 
         /*
+         * Remember highest leaf page number we've taken cleanup lock on; see
+         * notes in btvacuumscan
+         */
+        if (blkno > vstate->lastBlockLocked)
+            vstate->lastBlockLocked = blkno;
+
+        /*
         * Check whether we need to recurse back to earlier pages.  What we
         * are concerned about is a page split that happened since we started
         * the vacuum scan.  If the split moved some tuples to a lower page
@@ -1019,19 +1025,26 @@ restart:
         */
        if (ndeletable > 0)
        {
-           BlockNumber lastBlockVacuumed = BufferGetBlockNumber(buf);
-
+           /*
+            * Notice that the issued XLOG_BTREE_VACUUM WAL record includes an
+            * instruction to the replay code to get cleanup lock on all pages
+            * between the previous lastBlockVacuumed and this page.  This
+            * ensures that WAL replay locks all leaf pages at some point.
+            *
+            * Since we can visit leaf pages out-of-order when recursing,
+            * replay might end up locking such pages an extra time, but it
+            * doesn't seem worth the amount of bookkeeping it'd take to avoid
+            * that.
+            */
            _bt_delitems_vacuum(rel, buf, deletable, ndeletable,
                                vstate->lastBlockVacuumed);
 
            /*
-            * Keep track of the block number of the lastBlockVacuumed, so we
-            * can scan those blocks as well during WAL replay. This then
-            * provides concurrency protection and allows btrees to be used
-            * while in recovery.
+            * Remember highest leaf page number we've issued a
+            * XLOG_BTREE_VACUUM WAL record for.
             */
-           if (lastBlockVacuumed > vstate->lastBlockVacuumed)
-               vstate->lastBlockVacuumed = lastBlockVacuumed;
+           if (blkno > vstate->lastBlockVacuumed)
+               vstate->lastBlockVacuumed = blkno;
 
            stats->tuples_removed += ndeletable;
            /* must recompute maxoff */
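
For context, a rough sketch of the replay side that the new comments describe: on a hot-standby server, the redo routine for an XLOG_BTREE_VACUUM record takes a cleanup lock on every leaf block between the record's lastBlockVacuumed and the block being vacuumed, so that standby queries see the same interlock as on the primary. The real logic lives in btree_xlog_vacuum() in src/backend/access/nbtree/nbtxlog.c; the helper name and the exact struct layout below are assumptions for illustration, not the committed code.

/*
 * Illustrative sketch only -- not part of this commit.  Shows how replay
 * can walk the gap named by an XLOG_BTREE_VACUUM record and cleanup-lock
 * each page in it.  xl_btree_vacuum's fields (node, block,
 * lastBlockVacuumed) and XLogReadBufferExtended() follow the PostgreSQL
 * sources of this era, but treat them as assumed here.
 */
static void
btree_xlog_vacuum_lock_gap(xl_btree_vacuum *xlrec)
{
    BlockNumber blkno;

    /* Lock every block strictly between the two marks; a block that no
     * longer exists needs no lock. */
    for (blkno = xlrec->lastBlockVacuumed + 1; blkno < xlrec->block; blkno++)
    {
        Buffer      buf = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM,
                                                 blkno, RBM_NORMAL);

        if (BufferIsValid(buf))
        {
            /* Waits until no backend holds a pin, mirroring what
             * btvacuumpage() does on the primary. */
            LockBufferForCleanup(buf);
            UnlockReleaseBuffer(buf);
        }
    }
}

This is also why the patch adds the dummy record at the end of btvacuumscan(): without it, leaf pages after the last vacuumed block would never fall inside any record's (lastBlockVacuumed, block) gap, and replay would skip locking them.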