aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/backend/access/nbtree/README14
-rw-r--r--src/backend/access/nbtree/nbtinsert.c3
-rw-r--r--src/backend/access/nbtree/nbtpage.c462
-rw-r--r--src/backend/access/nbtree/nbtxlog.c8
-rw-r--r--src/include/access/nbtxlog.h6
5 files changed, 278 insertions, 215 deletions
diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README
index 2d0f8f4b79a..216d419841c 100644
--- a/src/backend/access/nbtree/README
+++ b/src/backend/access/nbtree/README
@@ -223,9 +223,10 @@ right to make this happen --- a scan moving in the opposite direction
might miss the items if so.) Also, we *never* delete the rightmost page
on a tree level (this restriction simplifies the traversal algorithms, as
explained below). Page deletion always begins from an empty leaf page. An
-internal page can only be deleted as part of a branch leading to a leaf
-page, where each internal page has only one child and that child is also to
-be deleted.
+internal page can only be deleted as part of deleting an entire subtree.
+This is always a "skinny" subtree consisting of a "chain" of internal pages
+plus a single leaf page. There is one page on each level of the subtree,
+and each level/page covers the same key space.
Deleting a leaf page is a two-stage process. In the first stage, the page
is unlinked from its parent, and marked as half-dead. The parent page must
@@ -243,7 +244,12 @@ it, but it's still linked to its siblings.
(Note: Lanin and Shasha prefer to make the key space move left, but their
argument for doing so hinges on not having left-links, which we have
-anyway. So we simplify the algorithm by moving the key space right.)
+anyway. So we simplify the algorithm by moving the key space right. This
+is only possible because we don't match on a separator key when ascending
+the tree during a page split, unlike Lehman and Yao/Lanin and Shasha -- it
+doesn't matter if the downlink is re-found in a pivot tuple whose separator
+key does not match the one encountered when inserter initially descended
+the tree.)
To preserve consistency on the parent level, we cannot merge the key space
of a page into its right sibling unless the right sibling is a child of
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index 1c1d4a4bc92..a36b35010e5 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -2278,7 +2278,8 @@ _bt_finish_split(Relation rel, Buffer lbuf, BTStack stack)
* stack. For example, the checkingunique _bt_doinsert() case may
* have to step right when there are many physical duplicates, and its
* scantid forces an insertion to the right of the "first page the
- * value could be on".
+ * value could be on". (This is also relied on by all of our callers
+ * when dealing with !heapkeyspace indexes.)
*
* Returns write-locked parent page buffer, or InvalidBuffer if pivot
* tuple not found (should not happen). Adjusts bts_blkno &
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index e146cc326c9..10abd61983a 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -39,9 +39,6 @@ static void _bt_log_reuse_page(Relation rel, BlockNumber blkno,
TransactionId latestRemovedXid);
static TransactionId _bt_xid_horizon(Relation rel, Relation heapRel, Page page,
OffsetNumber *deletable, int ndeletable);
-static bool _bt_lock_branch_parent(Relation rel, BlockNumber child,
- BTStack stack, Buffer *topparent, OffsetNumber *topoff,
- BlockNumber *target, BlockNumber *rightsib);
static bool _bt_mark_page_halfdead(Relation rel, Buffer leafbuf,
BTStack stack);
static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf,
@@ -49,6 +46,12 @@ static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf,
bool *rightsib_empty,
TransactionId *oldestBtpoXact,
uint32 *ndeleted);
+static bool _bt_lock_subtree_parent(Relation rel, BlockNumber child,
+ BTStack stack,
+ Buffer *subtreeparent,
+ OffsetNumber *poffset,
+ BlockNumber *topparent,
+ BlockNumber *topparentrightsib);
/*
* _bt_initmetapage() -- Fill a page buffer with a correct metapage image
@@ -1316,13 +1319,16 @@ _bt_xid_horizon(Relation rel, Relation heapRel, Page page,
/*
* Check that leftsib page (the btpo_prev of target page) is not marked with
- * INCOMPLETE_SPLIT flag.
+ * INCOMPLETE_SPLIT flag. Used during page deletion.
*
* Returning true indicates that page flag is set in leftsib (which is
* definitely still the left sibling of target). When that happens, the
* target doesn't have a downlink in parent, and the page deletion algorithm
* isn't prepared to handle that. Deletion of the target page (or the whole
* subtree that contains the target page) cannot take place.
+ *
+ * Caller should not have a lock on the target page itself, since pages on the
+ * same level must always be locked left to right to avoid deadlocks.
*/
static bool
_bt_leftsib_splitflag(Relation rel, BlockNumber leftsib, BlockNumber target)
@@ -1356,7 +1362,7 @@ _bt_leftsib_splitflag(Relation rel, BlockNumber leftsib, BlockNumber target)
/*
* Check that leafrightsib page (the btpo_next of target leaf page) is not
- * marked with ISHALFDEAD flag.
+ * marked with ISHALFDEAD flag. Used during page deletion.
*
* Returning true indicates that page flag is set in leafrightsib, so page
* deletion cannot go ahead. Our caller is not prepared to deal with the case
@@ -1403,121 +1409,6 @@ _bt_rightsib_halfdeadflag(Relation rel, BlockNumber leafrightsib)
}
/*
- * Subroutine to find the parent of the branch we're deleting. This climbs
- * up the tree until it finds a page with more than one child, i.e. a page
- * that will not be totally emptied by the deletion. The chain of pages below
- * it, with one downlink each, will form the branch that we need to delete.
- *
- * If we cannot remove the downlink from the parent, because it's the
- * rightmost entry, returns false. On success, *topparent and *topoff are set
- * to the buffer holding the parent, and the offset of the downlink in it.
- * *topparent is write-locked, the caller is responsible for releasing it when
- * done. *target is set to the topmost page in the branch to-be-deleted, i.e.
- * the page whose downlink *topparent / *topoff point to, and *rightsib to its
- * right sibling.
- *
- * "child" is the leaf page we wish to delete, and "stack" is a search stack
- * leading to it (it actually leads to the leftmost leaf page with a high key
- * matching that of the page to be deleted in !heapkeyspace indexes). Note
- * that we will update the stack entry(s) to reflect current downlink
- * positions --- this is essentially the same as the corresponding step of
- * splitting, and is not expected to affect caller. The caller should
- * initialize *target and *rightsib to the leaf page and its right sibling.
- *
- * Note: it's OK to release page locks on any internal pages between the leaf
- * and *topparent, because a safe deletion can't become unsafe due to
- * concurrent activity. An internal page can only acquire an entry if the
- * child is split, but that cannot happen as long as we hold a lock on the
- * leaf.
- */
-static bool
-_bt_lock_branch_parent(Relation rel, BlockNumber child, BTStack stack,
- Buffer *topparent, OffsetNumber *topoff,
- BlockNumber *target, BlockNumber *rightsib)
-{
- BlockNumber parent;
- OffsetNumber poffset,
- maxoff;
- Buffer pbuf;
- Page page;
- BTPageOpaque opaque;
-
- /*
- * Locate the downlink of "child" in the parent, updating the stack entry
- * if needed. This is how !heapkeyspace indexes deal with having
- * non-unique high keys in leaf level pages. Even heapkeyspace indexes
- * can have a stale stack due to insertions into the parent.
- */
- pbuf = _bt_getstackbuf(rel, stack, child);
- if (pbuf == InvalidBuffer)
- ereport(ERROR,
- (errcode(ERRCODE_INDEX_CORRUPTED),
- errmsg_internal("failed to re-find parent key in index \"%s\" for deletion target page %u",
- RelationGetRelationName(rel), child)));
- parent = stack->bts_blkno;
- poffset = stack->bts_offset;
-
- page = BufferGetPage(pbuf);
- opaque = (BTPageOpaque) PageGetSpecialPointer(page);
- maxoff = PageGetMaxOffsetNumber(page);
-
- /*
- * If the target is the rightmost child of its parent, then we can't
- * delete, unless it's also the only child.
- */
- Assert(poffset <= maxoff);
- if (poffset >= maxoff)
- {
- /* It's rightmost child... */
- if (poffset == P_FIRSTDATAKEY(opaque))
- {
- BlockNumber leftsibparent;
-
- /*
- * It's only child, so safe if parent would itself be removable.
- * We have to check the parent itself, and then recurse to test
- * the conditions at the parent's parent.
- */
- if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) ||
- P_INCOMPLETE_SPLIT(opaque))
- {
- _bt_relbuf(rel, pbuf);
- return false;
- }
-
- *target = parent;
- *rightsib = opaque->btpo_next;
- leftsibparent = opaque->btpo_prev;
-
- _bt_relbuf(rel, pbuf);
-
- /*
- * Check that the left sibling of parent (if any) is not marked
- * with INCOMPLETE_SPLIT flag before proceeding
- */
- if (_bt_leftsib_splitflag(rel, leftsibparent, parent))
- return false;
-
- return _bt_lock_branch_parent(rel, parent, stack->bts_parent,
- topparent, topoff, target, rightsib);
- }
- else
- {
- /* Unsafe to delete */
- _bt_relbuf(rel, pbuf);
- return false;
- }
- }
- else
- {
- /* Not rightmost child, so safe to delete */
- *topparent = pbuf;
- *topoff = poffset;
- return true;
- }
-}
-
-/*
* _bt_pagedel() -- Delete a leaf page from the b-tree, if legal to do so.
*
* This action unlinks the leaf page from the b-tree structure, removing all
@@ -1582,7 +1473,7 @@ _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact)
/*
* Internal pages are never deleted directly, only as part of deleting
- * the whole branch all the way down to leaf level.
+ * the whole subtree all the way down to leaf level.
*
* Also check for deleted pages here. Caller never passes us a fully
* deleted page. Only VACUUM can delete pages, so there can't have
@@ -1655,8 +1546,8 @@ _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact)
/*
* First, remove downlink pointing to the page (or a parent of the
- * page, if we are going to delete a taller branch), and mark the page
- * as half-dead.
+ * page, if we are going to delete a taller subtree), and mark the
+ * leafbuf page half-dead
*/
if (!P_ISHALFDEAD(opaque))
{
@@ -1675,14 +1566,14 @@ _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact)
BTScanInsert itup_key;
ItemId itemid;
IndexTuple targetkey;
- BlockNumber leftsib, target;
- Buffer lbuf;
+ BlockNumber leftsib, leafblkno;
+ Buffer sleafbuf;
itemid = PageGetItemId(page, P_HIKEY);
targetkey = CopyIndexTuple((IndexTuple) PageGetItem(page, itemid));
leftsib = opaque->btpo_prev;
- target = BufferGetBlockNumber(leafbuf);
+ leafblkno = BufferGetBlockNumber(leafbuf);
/*
* To avoid deadlocks, we'd better drop the leaf page lock
@@ -1694,8 +1585,8 @@ _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact)
* Check that the left sibling of leafbuf (if any) is not
* marked with INCOMPLETE_SPLIT flag before proceeding
*/
- Assert(target == scanblkno);
- if (_bt_leftsib_splitflag(rel, leftsib, target))
+ Assert(leafblkno == scanblkno);
+ if (_bt_leftsib_splitflag(rel, leftsib, leafblkno))
{
ReleaseBuffer(leafbuf);
return ndeleted;
@@ -1705,9 +1596,11 @@ _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact)
itup_key = _bt_mkscankey(rel, targetkey);
/* find the leftmost leaf page with matching pivot/high key */
itup_key->pivotsearch = true;
- stack = _bt_search(rel, itup_key, &lbuf, BT_READ, NULL);
+ stack = _bt_search(rel, itup_key, &sleafbuf, BT_READ, NULL);
/* won't need a second lock or pin on leafbuf */
- _bt_relbuf(rel, lbuf);
+ Assert(leafblkno == BufferGetBlockNumber(sleafbuf) ||
+ !itup_key->heapkeyspace);
+ _bt_relbuf(rel, sleafbuf);
/*
* Re-lock the leaf page, and start over to use our stack
@@ -1736,7 +1629,7 @@ _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact)
/*
* Then unlink it from its siblings. Each call to
- * _bt_unlink_halfdead_page unlinks the topmost page from the branch,
+ * _bt_unlink_halfdead_page unlinks the topmost page from the subtree,
* making it shallower. Iterate until the leafbuf page is deleted.
*
* _bt_unlink_halfdead_page should never fail, since we established
@@ -1795,21 +1688,31 @@ _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact)
}
/*
- * First stage of page deletion. Remove the downlink to the top of the
- * branch being deleted, and mark the leaf page as half-dead.
+ * First stage of page deletion.
+ *
+ * Establish the height of the to-be-deleted subtree with leafbuf at its
+ * lowest level, remove the downlink to the subtree, and mark leafbuf
+ * half-dead. The final to-be-deleted subtree is usually just leafbuf itself,
+ * but may include additional internal pages (at most one per level of the
+ * tree below the root).
+ *
+ * Returns 'false' if leafbuf is unsafe to delete, usually because leafbuf is
+ * the rightmost child of its parent (and parent has more than one downlink).
+ * Returns 'true' when the first stage of page deletion completed
+ * successfully.
*/
static bool
_bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack)
{
BlockNumber leafblkno;
BlockNumber leafrightsib;
- BlockNumber target;
- BlockNumber rightsib;
+ BlockNumber topparent;
+ BlockNumber topparentrightsib;
ItemId itemid;
Page page;
BTPageOpaque opaque;
- Buffer topparent;
- OffsetNumber topoff;
+ Buffer subtreeparent;
+ OffsetNumber poffset;
OffsetNumber nextoffset;
IndexTuple itup;
IndexTupleData trunctuple;
@@ -1817,8 +1720,8 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack)
page = BufferGetPage(leafbuf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
- Assert(!P_RIGHTMOST(opaque) && !P_ISROOT(opaque) && !P_ISDELETED(opaque) &&
- !P_ISHALFDEAD(opaque) && P_ISLEAF(opaque) &&
+ Assert(!P_RIGHTMOST(opaque) && !P_ISROOT(opaque) &&
+ P_ISLEAF(opaque) && !P_IGNORE(opaque) &&
P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));
/*
@@ -1847,47 +1750,57 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack)
* parent, unless it is the only child --- in which case the parent has to
* be deleted too, and the same condition applies recursively to it. We
* have to check this condition all the way up before trying to delete,
- * and lock the final parent of the to-be-deleted subtree.
+ * and lock the parent of the root of the to-be-deleted subtree (the
+ * "subtree parent"). _bt_lock_subtree_parent() locks the subtree parent
+ * for us. We remove the downlink to the "top parent" page (subtree root
+ * page) from the subtree parent page below.
+ *
+ * Initialize topparent to be leafbuf page now. The final to-be-deleted
+ * subtree is often a degenerate one page subtree consisting only of the
+ * leafbuf page. When that happens, the leafbuf page is the final subtree
+ * root page/top parent page.
*/
- rightsib = leafrightsib;
- target = leafblkno;
- if (!_bt_lock_branch_parent(rel, leafblkno, stack,
- &topparent, &topoff, &target, &rightsib))
+ topparent = leafblkno;
+ topparentrightsib = leafrightsib;
+ if (!_bt_lock_subtree_parent(rel, leafblkno, stack,
+ &subtreeparent, &poffset,
+ &topparent, &topparentrightsib))
return false;
/*
* Check that the parent-page index items we're about to delete/overwrite
- * contain what we expect. This can fail if the index has become corrupt
- * for some reason. We want to throw any error before entering the
- * critical section --- otherwise it'd be a PANIC.
- *
- * The test on the target item is just an Assert because
- * _bt_lock_branch_parent should have guaranteed it has the expected
- * contents. The test on the next-child downlink is known to sometimes
- * fail in the field, though.
+ * in subtree parent page contain what we expect. This can fail if the
+ * index has become corrupt for some reason. We want to throw any error
+ * before entering the critical section --- otherwise it'd be a PANIC.
*/
- page = BufferGetPage(topparent);
+ page = BufferGetPage(subtreeparent);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
#ifdef USE_ASSERT_CHECKING
- itemid = PageGetItemId(page, topoff);
+ /*
+ * This is just an assertion because _bt_lock_subtree_parent should have
+ * guaranteed tuple has the expected contents
+ */
+ itemid = PageGetItemId(page, poffset);
itup = (IndexTuple) PageGetItem(page, itemid);
- Assert(BTreeTupleGetDownLink(itup) == target);
+ Assert(BTreeTupleGetDownLink(itup) == topparent);
#endif
- nextoffset = OffsetNumberNext(topoff);
+ nextoffset = OffsetNumberNext(poffset);
itemid = PageGetItemId(page, nextoffset);
itup = (IndexTuple) PageGetItem(page, itemid);
- if (BTreeTupleGetDownLink(itup) != rightsib)
+ if (BTreeTupleGetDownLink(itup) != topparentrightsib)
ereport(ERROR,
- (errcode(ERRCODE_INDEX_CORRUPTED),
- errmsg_internal("right sibling %u of block %u is not next child %u of block %u in index \"%s\"",
- rightsib, target, BTreeTupleGetDownLink(itup),
- BufferGetBlockNumber(topparent), RelationGetRelationName(rel))));
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg_internal("right sibling %u of block %u is not next child %u of block %u in index \"%s\"",
+ topparentrightsib, topparent,
+ BTreeTupleGetDownLink(itup),
+ BufferGetBlockNumber(subtreeparent),
+ RelationGetRelationName(rel))));
/*
* Any insert which would have gone on the leaf block will now go to its
- * right sibling.
+ * right sibling. In other words, the key space moves right.
*/
PredicateLockPageCombine(rel, leafblkno, leafrightsib);
@@ -1895,25 +1808,31 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack)
START_CRIT_SECTION();
/*
- * Update parent. The normal case is a tad tricky because we want to
- * delete the target's downlink and the *following* key. Easiest way is
- * to copy the right sibling's downlink over the target downlink, and then
- * delete the following item.
+ * Update parent of subtree. We want to delete the downlink to the top
+ * parent page/root of the subtree, and the *following* key. Easiest way
+ * is to copy the right sibling's downlink over the downlink that points
+ * to top parent page, and then delete the right sibling's original pivot
+ * tuple.
+ *
+ * Lanin and Shasha make the key space move left when deleting a page,
+ * whereas the key space moves right here. That's why we cannot simply
+ * delete the pivot tuple with the downlink to the top parent page. See
+ * nbtree/README.
*/
- page = BufferGetPage(topparent);
+ page = BufferGetPage(subtreeparent);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
- itemid = PageGetItemId(page, topoff);
+ itemid = PageGetItemId(page, poffset);
itup = (IndexTuple) PageGetItem(page, itemid);
- BTreeTupleSetDownLink(itup, rightsib);
+ BTreeTupleSetDownLink(itup, topparentrightsib);
- nextoffset = OffsetNumberNext(topoff);
+ nextoffset = OffsetNumberNext(poffset);
PageIndexTupleDelete(page, nextoffset);
/*
- * Mark the leaf page as half-dead, and stamp it with a pointer to the
- * highest internal page in the branch we're deleting. We use the tid of
- * the high key to store it.
+ * Mark the leaf page as half-dead, and stamp it with a link to the top
+ * parent page. When the leaf page is also the top parent page, the link
+ * is set to InvalidBlockNumber.
*/
page = BufferGetPage(leafbuf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
@@ -1922,8 +1841,8 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack)
Assert(PageGetMaxOffsetNumber(page) == P_HIKEY);
MemSet(&trunctuple, 0, sizeof(IndexTupleData));
trunctuple.t_info = sizeof(IndexTupleData);
- if (target != leafblkno)
- BTreeTupleSetTopParent(&trunctuple, target);
+ if (topparent != leafblkno)
+ BTreeTupleSetTopParent(&trunctuple, topparent);
else
BTreeTupleSetTopParent(&trunctuple, InvalidBlockNumber);
@@ -1932,7 +1851,7 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack)
elog(ERROR, "could not overwrite high key in half-dead page");
/* Must mark buffers dirty before XLogInsert */
- MarkBufferDirty(topparent);
+ MarkBufferDirty(subtreeparent);
MarkBufferDirty(leafbuf);
/* XLOG stuff */
@@ -1941,16 +1860,16 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack)
xl_btree_mark_page_halfdead xlrec;
XLogRecPtr recptr;
- xlrec.poffset = topoff;
+ xlrec.poffset = poffset;
xlrec.leafblk = leafblkno;
- if (target != leafblkno)
- xlrec.topparent = target;
+ if (topparent != leafblkno)
+ xlrec.topparent = topparent;
else
xlrec.topparent = InvalidBlockNumber;
XLogBeginInsert();
XLogRegisterBuffer(0, leafbuf, REGBUF_WILL_INIT);
- XLogRegisterBuffer(1, topparent, REGBUF_STANDARD);
+ XLogRegisterBuffer(1, subtreeparent, REGBUF_STANDARD);
page = BufferGetPage(leafbuf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
@@ -1961,7 +1880,7 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack)
recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_MARK_PAGE_HALFDEAD);
- page = BufferGetPage(topparent);
+ page = BufferGetPage(subtreeparent);
PageSetLSN(page, recptr);
page = BufferGetPage(leafbuf);
PageSetLSN(page, recptr);
@@ -1969,17 +1888,21 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack)
END_CRIT_SECTION();
- _bt_relbuf(rel, topparent);
+ _bt_relbuf(rel, subtreeparent);
return true;
}
/*
- * Unlink a page in a branch of half-dead pages from its siblings.
+ * Second stage of page deletion.
*
- * If the leaf page still has a downlink pointing to it, unlinks the highest
- * parent in the to-be-deleted branch instead of the leaf page. To get rid
- * of the whole branch, including the leaf page itself, iterate until the
- * leaf page is deleted.
+ * Unlinks a single page (in the subtree undergoing deletion) from its
+ * siblings. Also marks the page deleted.
+ *
+ * To get rid of the whole subtree, including the leaf page itself, call here
+ * until the leaf page is deleted. The original "top parent" established in
+ * the first stage of deletion is deleted in the first call here, while the
+ * leaf page is deleted in the last call here. Note that the leaf page itself
+ * is often the initial top parent page.
*
* Returns 'false' if the page could not be unlinked (shouldn't happen). If
* the right sibling of the current target page is empty, *rightsib_empty is
@@ -2028,7 +1951,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
page = BufferGetPage(leafbuf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
- Assert(P_ISLEAF(opaque) && P_ISHALFDEAD(opaque));
+ Assert(P_ISLEAF(opaque) && !P_ISDELETED(opaque) && P_ISHALFDEAD(opaque));
/*
* Remember some information about the leaf page.
@@ -2047,14 +1970,10 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
*/
CHECK_FOR_INTERRUPTS();
- /*
- * If the leaf page still has a parent pointing to it (or a chain of
- * parents), we don't unlink the leaf page yet, but the topmost remaining
- * parent in the branch (i.e. the "top parent")
- */
+ /* Unlink the current top parent of the subtree */
if (!BlockNumberIsValid(target))
{
- /* No top parent, so target is leaf page */
+ /* Target is leaf page (or leaf page is top parent, if you prefer) */
target = leafblkno;
buf = leafbuf;
@@ -2063,7 +1982,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
}
else
{
- /* Target is the internal page taken from leaf's top parent */
+ /* Target is the internal page taken from leaf's top parent link */
Assert(target != leafblkno);
/* Fetch the block number of the target's left sibling */
@@ -2155,10 +2074,9 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
* only one vacuum process running at a time.
*/
if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque))
- {
elog(ERROR, "half-dead page changed status unexpectedly in block %u of index \"%s\"",
target, RelationGetRelationName(rel));
- }
+
if (opaque->btpo_prev != leftsib)
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
@@ -2180,7 +2098,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
elog(ERROR, "half-dead page changed status unexpectedly in block %u of index \"%s\"",
target, RelationGetRelationName(rel));
- /* remember the next non-leaf child down in the branch. */
+ /* Remember the next non-leaf child down in the subtree */
itemid = PageGetItemId(page, P_FIRSTDATAKEY(opaque));
nextchild = BTreeTupleGetDownLink((IndexTuple) PageGetItem(page, itemid));
if (nextchild == leafblkno)
@@ -2265,7 +2183,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
/*
* If we deleted a parent of the targeted leaf page, instead of the leaf
* itself, update the leaf to point to the next remaining child in the
- * branch.
+ * subtree.
*
* Note: We rely on the fact that a buffer pin on the leaf page has been
* held since leafhikey was initialized. This is safe, though only
@@ -2406,12 +2324,150 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
if (target <= scanblkno)
(*ndeleted)++;
- /*
- * Release the target, if it was not the leaf block. The leaf is always
- * kept locked.
- */
+ /* If the target is not leafbuf, we're done with it now -- release it */
if (target != leafblkno)
_bt_relbuf(rel, buf);
return true;
}
+
+/*
+ * Establish how tall the to-be-deleted subtree will be during the first stage
+ * of page deletion.
+ *
+ * Caller's child argument is the block number of the page caller wants to
+ * delete (this is leafbuf's block number, except when we're called
+ * recursively). stack is a search stack leading to it. Note that we will
+ * update the stack entry(s) to reflect current downlink positions --- this is
+ * similar to the corresponding point in page split handling.
+ *
+ * If "first stage" caller cannot go ahead with deleting _any_ pages, returns
+ * false. Returns true on success, in which case caller can use certain
+ * details established here to perform the first stage of deletion. This
+ * function is the last point at which page deletion may be deemed unsafe
+ * (barring index corruption, or unexpected concurrent page deletions).
+ *
+ * We write lock the parent of the root of the to-be-deleted subtree for
+ * caller on success (i.e. we leave our lock on the *subtreeparent buffer for
+ * caller). Caller will have to remove a downlink from *subtreeparent. We
+ * also set a *subtreeparent offset number in *poffset, to indicate the
+ * location of the pivot tuple that contains the relevant downlink.
+ *
+ * The root of the to-be-deleted subtree is called the "top parent". Note
+ * that the leafbuf page is often the final "top parent" page (you can think
+ * of the leafbuf page as a degenerate single page subtree when that happens).
+ * Caller should initialize *topparent to the target leafbuf page block number
+ * (while *topparentrightsib should be set to leafbuf's right sibling block
+ * number). We will update *topparent (and *topparentrightsib) for caller
+ * here, though only when it turns out that caller will delete at least one
+ * internal page (i.e. only when caller needs to store a valid link to the top
+ * parent block in the leafbuf page using BTreeTupleSetTopParent()).
+ */
+static bool
+_bt_lock_subtree_parent(Relation rel, BlockNumber child, BTStack stack,
+ Buffer *subtreeparent, OffsetNumber *poffset,
+ BlockNumber *topparent, BlockNumber *topparentrightsib)
+{
+ BlockNumber parent, leftsibparent;
+ OffsetNumber parentoffset,
+ maxoff;
+ Buffer pbuf;
+ Page page;
+ BTPageOpaque opaque;
+
+ /*
+ * Locate the pivot tuple whose downlink points to "child". Write lock
+ * the parent page itself.
+ */
+ pbuf = _bt_getstackbuf(rel, stack, child);
+ if (pbuf == InvalidBuffer)
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg_internal("failed to re-find parent key in index \"%s\" for deletion target page %u",
+ RelationGetRelationName(rel), child)));
+ parent = stack->bts_blkno;
+ parentoffset = stack->bts_offset;
+
+ page = BufferGetPage(pbuf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ maxoff = PageGetMaxOffsetNumber(page);
+ leftsibparent = opaque->btpo_prev;
+
+ /*
+ * _bt_getstackbuf() completes page splits on returned parent buffer when
+ * required.
+ *
+ * In general it's a bad idea for VACUUM to use up more disk space, which
+ * is why page deletion does not finish incomplete page splits most of the
+ * time. We allow this limited exception because the risk is much lower,
+ * and the potential downside of not proceeding is much higher: A single
+ * internal page with the INCOMPLETE_SPLIT flag set might otherwise
+ * prevent us from deleting hundreds of empty leaf pages from one level
+ * down.
+ */
+ Assert(!P_INCOMPLETE_SPLIT(opaque));
+
+ if (parentoffset < maxoff)
+ {
+ /*
+ * Child is not the rightmost child in parent, so it's safe to delete
+ * the subtree whose root/topparent is child page
+ */
+ *subtreeparent = pbuf;
+ *poffset = parentoffset;
+ return true;
+ }
+
+ /*
+ * Child is the rightmost child of parent.
+ *
+ * Since it's the rightmost child of parent, deleting the child (or
+ * deleting the subtree whose root/topparent is the child page) is only
+ * safe when it's also possible to delete the parent.
+ */
+ Assert(parentoffset == maxoff);
+ if (parentoffset != P_FIRSTDATAKEY(opaque) || P_RIGHTMOST(opaque))
+ {
+ /*
+ * Child isn't parent's only child, or parent is rightmost on its
+ * entire level. Definitely cannot delete any pages.
+ */
+ _bt_relbuf(rel, pbuf);
+ return false;
+ }
+
+ /*
+ * Now make sure that the parent deletion is itself safe by examining the
+ * child's grandparent page. Recurse, passing the parent page as the
+ * child page (child's grandparent is the parent on the next level up).
+ * If parent deletion is unsafe, then child deletion must also be unsafe
+ * (in which case caller cannot delete any pages at all).
+ */
+ *topparent = parent;
+ *topparentrightsib = opaque->btpo_next;
+
+ /*
+ * Release lock on parent before recursing.
+ *
+ * It's OK to release page locks on parent before recursive call locks
+ * grandparent. An internal page can only acquire an entry if the child
+ * is split, but that cannot happen as long as we still hold a lock on the
+ * leafbuf page.
+ */
+ _bt_relbuf(rel, pbuf);
+
+ /*
+ * Before recursing, check that the left sibling of parent (if any) is not
+ * marked with INCOMPLETE_SPLIT flag first (must do so after we drop the
+ * parent lock).
+ *
+ * Note: We deliberately avoid completing incomplete splits here.
+ */
+ if (_bt_leftsib_splitflag(rel, leftsibparent, parent))
+ return false;
+
+ /* Recurse to examine child page's grandparent page */
+ return _bt_lock_subtree_parent(rel, parent, stack->bts_parent,
+ subtreeparent, poffset,
+ topparent, topparentrightsib);
+}
diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c
index 493931e1b83..87a8612c28c 100644
--- a/src/backend/access/nbtree/nbtxlog.c
+++ b/src/backend/access/nbtree/nbtxlog.c
@@ -704,7 +704,7 @@ btree_xlog_mark_page_halfdead(uint8 info, XLogReaderState *record)
* target page or not (since it's surely empty).
*/
- /* parent page */
+ /* to-be-deleted subtree's parent page */
if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO)
{
OffsetNumber poffset;
@@ -749,8 +749,8 @@ btree_xlog_mark_page_halfdead(uint8 info, XLogReaderState *record)
pageop->btpo_cycleid = 0;
/*
- * Construct a dummy hikey item that points to the next parent to be
- * deleted (if any).
+ * Construct a dummy high key item that points to top parent page (value
+ * is InvalidBlockNumber when the top parent page is the leaf page itself)
*/
MemSet(&trunctuple, 0, sizeof(IndexTupleData));
trunctuple.t_info = sizeof(IndexTupleData);
@@ -837,7 +837,7 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record)
/*
* If we deleted a parent of the targeted leaf page, instead of the leaf
* itself, update the leaf to point to the next remaining child in the
- * branch.
+ * to-be-deleted subtree
*/
if (XLogRecHasBlockRef(record, 3))
{
diff --git a/src/include/access/nbtxlog.h b/src/include/access/nbtxlog.h
index ec7b2a6c33e..5c014bdc664 100644
--- a/src/include/access/nbtxlog.h
+++ b/src/include/access/nbtxlog.h
@@ -250,7 +250,7 @@ typedef struct xl_btree_vacuum
#define SizeOfBtreeVacuum (offsetof(xl_btree_vacuum, nupdated) + sizeof(uint16))
/*
- * This is what we need to know about marking an empty branch for deletion.
+ * This is what we need to know about marking an empty subtree for deletion.
* The target identifies the tuple removed from the parent page (note that we
* remove this tuple's downlink and the *following* tuple's key). Note that
* the leaf page is empty, so we don't need to store its content --- it is
@@ -267,7 +267,7 @@ typedef struct xl_btree_mark_page_halfdead
BlockNumber leafblk; /* leaf block ultimately being deleted */
BlockNumber leftblk; /* leaf block's left sibling, if any */
BlockNumber rightblk; /* leaf block's right sibling */
- BlockNumber topparent; /* topmost internal page in the branch */
+ BlockNumber topparent; /* topmost internal page in the subtree */
} xl_btree_mark_page_halfdead;
#define SizeOfBtreeMarkPageHalfDead (offsetof(xl_btree_mark_page_halfdead, topparent) + sizeof(BlockNumber))
@@ -294,7 +294,7 @@ typedef struct xl_btree_unlink_page
*/
BlockNumber leafleftsib;
BlockNumber leafrightsib;
- BlockNumber topparent; /* next child down in the branch */
+ BlockNumber topparent; /* next child down in the subtree */
TransactionId btpo_xact; /* value of btpo.xact for use in recovery */
/* xl_btree_metadata FOLLOWS IF XLOG_BTREE_UNLINK_PAGE_META */