aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--contrib/amcheck/verify_nbtree.c582
-rw-r--r--doc/src/sgml/amcheck.sgml6
2 files changed, 465 insertions, 123 deletions
diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c
index 31717321b0d..ceaaa271680 100644
--- a/contrib/amcheck/verify_nbtree.c
+++ b/contrib/amcheck/verify_nbtree.c
@@ -95,15 +95,25 @@ typedef struct BtreeCheckState
XLogRecPtr targetlsn;
/*
+ * Low key: high key of left sibling of target page. Used only for child
+ * verification. So, 'lowkey' is kept only when 'readonly' is set.
+ */
+ IndexTuple lowkey;
+
+ /*
+ * The rightlink and incomplete split flag of block one level down to the
+ * target page, which was visited last time via downlink from taget page.
+ * We use it to check for missing downlinks.
+ */
+ BlockNumber prevrightlink;
+ bool previncompletesplit;
+
+ /*
* Mutable state, for optional heapallindexed verification:
*/
/* Bloom filter fingerprints B-Tree index */
bloom_filter *filter;
- /* Bloom filter fingerprints downlink blocks within tree */
- bloom_filter *downlinkfilter;
- /* Right half of incomplete split marker */
- bool rightsplit;
/* Debug counter */
int64 heaptuplespresent;
} BtreeCheckState;
@@ -137,9 +147,14 @@ static BtreeLevel bt_check_level_from_leftmost(BtreeCheckState *state,
BtreeLevel level);
static void bt_target_page_check(BtreeCheckState *state);
static BTScanInsert bt_right_page_check_scankey(BtreeCheckState *state);
-static void bt_downlink_check(BtreeCheckState *state, BTScanInsert targetkey,
- BlockNumber childblock);
-static void bt_downlink_missing_check(BtreeCheckState *state);
+static void bt_child_check(BtreeCheckState *state, BTScanInsert targetkey,
+ OffsetNumber downlinkoffnum);
+static void bt_child_highkey_check(BtreeCheckState *state,
+ OffsetNumber target_downlinkoffnum,
+ Page loaded_child,
+ uint32 target_level);
+static void bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit,
+ BlockNumber targetblock, Page target);
static void bt_tuple_present_callback(Relation index, ItemPointer tid,
Datum *values, bool *isnull,
bool tupleIsAlive, void *checkstate);
@@ -470,20 +485,6 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace,
errmsg("index \"%s\" cannot be verified using transaction snapshot",
RelationGetRelationName(rel))));
}
- else
- {
- /*
- * Extra readonly downlink check.
- *
- * In readonly case, we know that there cannot be a concurrent
- * page split or a concurrent page deletion, which gives us the
- * opportunity to verify that every non-ignorable page had a
- * downlink one level up. We must be tolerant of interrupted page
- * splits and page deletions, though. This is taken care of in
- * bt_downlink_missing_check().
- */
- state->downlinkfilter = bloom_create(total_pages, work_mem, seed);
- }
}
Assert(!state->rootdescend || state->readonly);
@@ -533,12 +534,6 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace,
while (current.leftmost != P_NONE)
{
/*
- * Leftmost page on level cannot be right half of incomplete split.
- * This can go stale immediately in !readonly case.
- */
- state->rightsplit = false;
-
- /*
* Verify this level, and get left most page for next level down, if
* not at leaf level
*/
@@ -561,16 +556,6 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace,
IndexInfo *indexinfo = BuildIndexInfo(state->rel);
TableScanDesc scan;
- /* Report on extra downlink checks performed in readonly case */
- if (state->readonly)
- {
- ereport(DEBUG1,
- (errmsg_internal("finished verifying presence of downlink blocks within index \"%s\" with bitset %.2f%% set",
- RelationGetRelationName(rel),
- 100.0 * bloom_prop_bits_set(state->downlinkfilter))));
- bloom_free(state->downlinkfilter);
- }
-
/*
* Create our own scan for table_index_build_scan(), rather than
* getting it to do so for us. This is required so that we can
@@ -673,6 +658,9 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level)
level.istruerootlevel ?
" (true root level)" : level.level == 0 ? " (leaf level)" : "");
+ state->prevrightlink = InvalidBlockNumber;
+ state->previncompletesplit = false;
+
do
{
/* Don't rely on CHECK_FOR_INTERRUPTS() calls at lower level */
@@ -692,9 +680,9 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level)
* mode, and since a page has no links within other pages
* (siblings and parent) once it is marked fully deleted, it
* should be impossible to land on a fully deleted page in
- * readonly mode. See bt_downlink_check() for further details.
+ * readonly mode. See bt_child_check() for further details.
*
- * The bt_downlink_check() P_ISDELETED() check is repeated here so
+ * The bt_child_check() P_ISDELETED() check is repeated here so
* that pages that are only reachable through sibling links get
* checked.
*/
@@ -816,21 +804,57 @@ nextpage:
errmsg("circular link chain found in block %u of index \"%s\"",
current, RelationGetRelationName(state->rel))));
+ leftcurrent = current;
+ current = opaque->btpo_next;
+
+ if (state->lowkey)
+ {
+ Assert(state->readonly);
+ pfree(state->lowkey);
+ state->lowkey = NULL;
+ }
+
/*
- * Record if page that is about to become target is the right half of
- * an incomplete page split. This can go stale immediately in
- * !readonly case.
+ * Copy current target high key as the low key of right sibling.
+ * Allocate memory in upper level context, so it would be cleared
+ * after reset of target context.
+ *
+ * We only need the low key in corner cases of checking child high
+ * keys. We use high key only when incomplete split on the child level
+ * falls to the boundary of pages on the target level. See
+ * bt_child_highkey_check() for details. So, typically we won't end
+ * up doing anything with low key, but it's simpler for general case
+ * high key verification to always have it available.
+ *
+ * The correctness of managing low key in the case of concurrent
+ * splits wasn't investigated yet. Thankfully we only need low key
+ * for readonly verification and concurrent splits won't happen.
*/
- state->rightsplit = P_INCOMPLETE_SPLIT(opaque);
+ if (state->readonly && !P_RIGHTMOST(opaque))
+ {
+ IndexTuple itup;
+ ItemId itemid;
- leftcurrent = current;
- current = opaque->btpo_next;
+ itemid = PageGetItemIdCareful(state, state->targetblock,
+ state->target, P_HIKEY);
+ itup = (IndexTuple) PageGetItem(state->target, itemid);
+
+ state->lowkey = MemoryContextAlloc(oldcontext, IndexTupleSize(itup));
+ memcpy(state->lowkey, itup, IndexTupleSize(itup));
+ }
/* Free page and associated memory for this iteration */
MemoryContextReset(state->targetcontext);
}
while (current != P_NONE);
+ if (state->lowkey)
+ {
+ Assert(state->readonly);
+ pfree(state->lowkey);
+ state->lowkey = NULL;
+ }
+
/* Don't change context for caller */
MemoryContextSwitchTo(oldcontext);
@@ -863,7 +887,9 @@ nextpage:
* tuple.
*
* - That downlink to block was encountered in parent where that's expected.
- * (Limited to heapallindexed readonly callers.)
+ * (Limited to readonly callers.)
+ *
+ * - That high keys of child pages matches corresponding pivot keys in parent.
*
* This is also where heapallindexed callers use their Bloom filter to
* fingerprint IndexTuples for later table_index_build_scan() verification.
@@ -981,22 +1007,26 @@ bt_target_page_check(BtreeCheckState *state)
(uint32) state->targetlsn)));
}
- /* Fingerprint downlink blocks in heapallindexed + readonly case */
- if (state->heapallindexed && state->readonly && !P_ISLEAF(topaque))
- {
- BlockNumber childblock = BTreeTupleGetDownLink(itup);
-
- bloom_add_element(state->downlinkfilter,
- (unsigned char *) &childblock,
- sizeof(BlockNumber));
- }
-
/*
* Don't try to generate scankey using "negative infinity" item on
* internal pages. They are always truncated to zero attributes.
*/
if (offset_is_negative_infinity(topaque, offset))
+ {
+ /*
+ * We don't call bt_child_check() for "negative infinity" items.
+ * But if we're performing downlink connectivity check, we do it
+ * for every item including "negative infinity" one.
+ */
+ if (!P_ISLEAF(topaque) && state->readonly)
+ {
+ bt_child_highkey_check(state,
+ offset,
+ NULL,
+ topaque->btpo.level);
+ }
continue;
+ }
/*
* Readonly callers may optionally verify that non-pivot tuples can
@@ -1340,20 +1370,23 @@ bt_target_page_check(BtreeCheckState *state)
* because it has no useful value to compare).
*/
if (!P_ISLEAF(topaque) && state->readonly)
- {
- BlockNumber childblock = BTreeTupleGetDownLink(itup);
-
- bt_downlink_check(state, skey, childblock);
- }
+ bt_child_check(state, skey, offset);
}
/*
- * * Check if page has a downlink in parent *
+ * Special case bt_child_highkey_check() call
*
- * This can only be checked in heapallindexed + readonly case.
+ * We don't pass an real downlink, but we've to finish the level
+ * processing. If condition is satisfied, we've already processed all the
+ * downlinks from the target level. But there still might be pages to the
+ * right of the child page pointer to by our rightmost downlink. And they
+ * might have missing downlinks. This final call checks for them.
*/
- if (state->heapallindexed && state->readonly)
- bt_downlink_missing_check(state);
+ if (!P_ISLEAF(topaque) && P_RIGHTMOST(topaque) && state->readonly)
+ {
+ bt_child_highkey_check(state, InvalidOffsetNumber,
+ NULL, topaque->btpo.level);
+ }
}
/*
@@ -1497,7 +1530,7 @@ bt_right_page_check_scankey(BtreeCheckState *state)
* Top level tree walk caller moves on to next page (makes it the new
* target) following recovery from this race. (cf. The rationale for
* child/downlink verification needing a ShareLock within
- * bt_downlink_check(), where page deletion is also the main source of
+ * bt_child_check(), where page deletion is also the main source of
* trouble.)
*
* Note that it doesn't matter if right sibling page here is actually a
@@ -1571,6 +1604,297 @@ bt_right_page_check_scankey(BtreeCheckState *state)
}
/*
+ * Check if two tuples are binary identical except the block number. So,
+ * this function is capable to compare pivot keys on different levels.
+ */
+static bool
+bt_pivot_tuple_identical(IndexTuple itup1, IndexTuple itup2)
+{
+ if (IndexTupleSize(itup1) != IndexTupleSize(itup2))
+ return false;
+
+ if (memcmp(&itup1->t_tid.ip_posid, &itup2->t_tid.ip_posid,
+ IndexTupleSize(itup1) - offsetof(ItemPointerData, ip_posid)) != 0)
+ return false;
+
+ return true;
+}
+
+/*---
+ * Check high keys on the child level. Traverse rightlinks from previous
+ * downlink to the current one. Check that there are no intermediate pages
+ * with missing downlinks.
+ *
+ * If 'loaded_child' is given, it's assumed to be the page pointed to by the
+ * downlink referenced by 'downlinkoffnum' of the target page.
+ *
+ * Basically this function is called for each target downlink and checks two
+ * invariants:
+ *
+ * 1) You can reach the next child from previous one via rightlinks;
+ * 2) Each child high key have matching pivot key on target level.
+ *
+ * Consider the sample tree picture.
+ *
+ * 1
+ * / \
+ * 2 <-> 3
+ * / \ / \
+ * 4 <> 5 <> 6 <> 7 <> 8
+ *
+ * This function will be called for blocks 4, 5, 6 and 8. Consider what is
+ * happening for each function call.
+ *
+ * - The function call for block 4 initializes data structure and matches high
+ * key of block 4 to downlink's pivot key of block 2.
+ * - The high key of block 5 is matched to the high key of block 2.
+ * - The block 6 has an incomplete split flag set, so its high key isn't
+ * matched to anything.
+ * - The function call for block 8 checks that block 8 can be found while
+ * following rightlinks from block 6. The high key of block 7 will be
+ * matched to downlink's pivot key in block 3.
+ *
+ * There is also final call of this function, which checks that there is no
+ * missing downlinks for children to the right of the child referenced by
+ * rightmost downlink in target level.
+ */
+static void
+bt_child_highkey_check(BtreeCheckState *state,
+ OffsetNumber target_downlinkoffnum,
+ Page loaded_child,
+ uint32 target_level)
+{
+ BlockNumber blkno = state->prevrightlink;
+ Page page;
+ BTPageOpaque opaque;
+ bool rightsplit = state->previncompletesplit;
+ bool first = true;
+ ItemId itemid;
+ IndexTuple itup;
+ BlockNumber downlink;
+
+ if (OffsetNumberIsValid(target_downlinkoffnum))
+ {
+ itemid = PageGetItemIdCareful(state, state->targetblock,
+ state->target, target_downlinkoffnum);
+ itup = (IndexTuple) PageGetItem(state->target, itemid);
+ downlink = BTreeTupleGetDownLink(itup);
+ }
+ else
+ {
+ downlink = P_NONE;
+ }
+
+ /*
+ * If no previous rightlink is memorized for current level just below
+ * target page's level, we are about to start from the leftmost page. We
+ * can't follow rightlinks from previous page, because there is no
+ * previous page. But we still can match high key.
+ *
+ * So we initialize variables for the loop above like there is previous
+ * page referencing current child. Also we imply previous page to not
+ * have incomplete split flag, that would make us require downlink for
+ * current child. That's correct, because leftmost page on the level
+ * should always have parent downlink.
+ */
+ if (!BlockNumberIsValid(blkno))
+ {
+ blkno = downlink;
+ rightsplit = false;
+ }
+
+ /* Move to the right on the child level */
+ while (true)
+ {
+ /*
+ * Did we traverse the whole tree level and this is check for pages to
+ * the right of rightmost downlink?
+ */
+ if (blkno == P_NONE && downlink == P_NONE)
+ {
+ state->prevrightlink = InvalidBlockNumber;
+ state->previncompletesplit = false;
+ return;
+ }
+
+ /* Did we traverse the whole tree level and don't find next downlink? */
+ if (blkno == P_NONE)
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg("can't traverse from downlink %u to downlink %u of index \"%s\"",
+ state->prevrightlink, downlink,
+ RelationGetRelationName(state->rel))));
+
+ /* Load page contents */
+ if (blkno == downlink && loaded_child)
+ page = loaded_child;
+ else
+ page = palloc_btree_page(state, blkno);
+
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ /* The first page we visit at the level should be leftmost */
+ if (first && !BlockNumberIsValid(state->prevrightlink) && !P_LEFTMOST(opaque))
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg("the first child of leftmost target page is not leftmost of its level in index \"%s\"",
+ RelationGetRelationName(state->rel)),
+ errdetail_internal("Target block=%u child block=%u target page lsn=%X/%X.",
+ state->targetblock, blkno,
+ (uint32) (state->targetlsn >> 32),
+ (uint32) state->targetlsn)));
+
+ /* Check level for non-ignorable page */
+ if (!P_IGNORE(opaque) && opaque->btpo.level != target_level - 1)
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg("block found while following rightlinks from child of index \"%s\" has invalid level",
+ RelationGetRelationName(state->rel)),
+ errdetail_internal("Block pointed to=%u expected level=%u level in pointed to block=%u.",
+ blkno, target_level - 1, opaque->btpo.level)));
+
+ /* Try to detect circular links */
+ if ((!first && blkno == state->prevrightlink) || blkno == opaque->btpo_prev)
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg("circular link chain found in block %u of index \"%s\"",
+ blkno, RelationGetRelationName(state->rel))));
+
+ if (blkno != downlink && !P_IGNORE(opaque))
+ {
+ /* blkno probably has missing parent downlink */
+ bt_downlink_missing_check(state, rightsplit, blkno, page);
+ }
+
+ rightsplit = P_INCOMPLETE_SPLIT(opaque);
+
+ /*
+ * If we visit page with high key, check that it is be equal to the
+ * target key next to corresponding downlink.
+ */
+ if (!rightsplit && !P_RIGHTMOST(opaque))
+ {
+ BTPageOpaque topaque;
+ IndexTuple highkey;
+ OffsetNumber pivotkey_offset;
+
+ /* Get high key */
+ itemid = PageGetItemIdCareful(state, blkno, page, P_HIKEY);
+ highkey = (IndexTuple) PageGetItem(page, itemid);
+
+ /*
+ * There might be two situations when we examine high key. If
+ * current child page is referenced by given target downlink, we
+ * should look to the next offset number for matching key from
+ * target page.
+ *
+ * Alternatively, we're following rightlinks somewhere in the
+ * middle between page referenced by previous target's downlink
+ * and the page referenced by current target's downlink. If
+ * current child page hasn't incomplete split flag set, then its
+ * high key should match to the target's key of current offset
+ * number. This happens when a previous call here (to
+ * bt_child_highkey_check()) found an incomplete split, and we
+ * reach a right sibling page without a downlink -- the right
+ * sibling page's high key still needs to be matched to a
+ * separator key on the parent/target level.
+ *
+ * Don't apply OffsetNumberNext() to target_downlinkoffnum when we
+ * already had to step right on the child level. Our traversal of
+ * the child level must try to move in perfect lockstep behind (to
+ * the left of) the target/parent level traversal.
+ */
+ if (blkno == downlink)
+ pivotkey_offset = OffsetNumberNext(target_downlinkoffnum);
+ else
+ pivotkey_offset = target_downlinkoffnum;
+
+ topaque = (BTPageOpaque) PageGetSpecialPointer(state->target);
+
+ if (!offset_is_negative_infinity(topaque, pivotkey_offset))
+ {
+ /*
+ * If we're looking for the next pivot tuple in target page,
+ * but there is no more pivot tuples, then we should match to
+ * high key instead.
+ */
+ if (pivotkey_offset > PageGetMaxOffsetNumber(state->target))
+ {
+ if (P_RIGHTMOST(topaque))
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg("child high key is greater than rightmost pivot key on target level in index \"%s\"",
+ RelationGetRelationName(state->rel)),
+ errdetail_internal("Target block=%u child block=%u target page lsn=%X/%X.",
+ state->targetblock, blkno,
+ (uint32) (state->targetlsn >> 32),
+ (uint32) state->targetlsn)));
+ pivotkey_offset = P_HIKEY;
+ }
+ itemid = PageGetItemIdCareful(state, state->targetblock,
+ state->target, pivotkey_offset);
+ itup = (IndexTuple) PageGetItem(state->target, itemid);
+ }
+ else
+ {
+ /*
+ * We cannot try to match child's high key to a negative
+ * infinity key in target, since there is nothing to compare.
+ * However, it's still possible to match child's high key
+ * outside of target page. The reason why we're are is that
+ * bt_child_highkey_check() was previously called for the
+ * cousin page of 'loaded_child', which is incomplete split.
+ * So, now we traverse to the right of that cousin page and
+ * current child level page under consideration still belongs
+ * to the subtree of target's left sibling. Thus, we need to
+ * match child's high key to it's left uncle page high key.
+ * Thankfully we saved it, it's called a "low key" of target
+ * page.
+ */
+ if (!state->lowkey)
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg("can't find left sibling high key in index \"%s\"",
+ RelationGetRelationName(state->rel)),
+ errdetail_internal("Target block=%u child block=%u target page lsn=%X/%X.",
+ state->targetblock, blkno,
+ (uint32) (state->targetlsn >> 32),
+ (uint32) state->targetlsn)));
+ itup = state->lowkey;
+ }
+
+ if (!bt_pivot_tuple_identical(highkey, itup))
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg("mismatch between parent key and child high key in index \"%s\"",
+ RelationGetRelationName(state->rel)),
+ errdetail_internal("Target block=%u child block=%u target page lsn=%X/%X.",
+ state->targetblock, blkno,
+ (uint32) (state->targetlsn >> 32),
+ (uint32) state->targetlsn)));
+ }
+ }
+
+ /* Exit if we already found next downlink */
+ if (blkno == downlink)
+ {
+ state->prevrightlink = opaque->btpo_next;
+ state->previncompletesplit = rightsplit;
+ return;
+ }
+
+ /* Traverse to the next page using rightlink */
+ blkno = opaque->btpo_next;
+
+ /* Free page contents if it's allocated by us */
+ if (page != loaded_child)
+ pfree(page);
+ first = false;
+ }
+}
+
+/*
* Checks one of target's downlink against its child page.
*
* Conceptually, the target page continues to be what is checked here. The
@@ -1578,15 +1902,28 @@ bt_right_page_check_scankey(BtreeCheckState *state)
* The downlink insertion into the target is probably where any problem raised
* here arises, and there is no such thing as a parent link, so doing the
* verification this way around is much more practical.
+ *
+ * This function visits child page and it's sequentially called for each
+ * downlink of target page. Assuming this we also check downlink connectivity
+ * here in order to save child page visits.
*/
static void
-bt_downlink_check(BtreeCheckState *state, BTScanInsert targetkey,
- BlockNumber childblock)
+bt_child_check(BtreeCheckState *state, BTScanInsert targetkey,
+ OffsetNumber downlinkoffnum)
{
+ ItemId itemid;
+ IndexTuple itup;
+ BlockNumber childblock;
OffsetNumber offset;
OffsetNumber maxoffset;
Page child;
BTPageOpaque copaque;
+ BTPageOpaque topaque;
+
+ itemid = PageGetItemIdCareful(state, state->targetblock,
+ state->target, downlinkoffnum);
+ itup = (IndexTuple) PageGetItem(state->target, itemid);
+ childblock = BTreeTupleGetDownLink(itup);
/*
* Caller must have ShareLock on target relation, because of
@@ -1636,11 +1973,19 @@ bt_downlink_check(BtreeCheckState *state, BTScanInsert targetkey,
* Check all items, rather than checking just the first and trusting that
* the operator class obeys the transitive law.
*/
+ topaque = (BTPageOpaque) PageGetSpecialPointer(state->target);
child = palloc_btree_page(state, childblock);
copaque = (BTPageOpaque) PageGetSpecialPointer(child);
maxoffset = PageGetMaxOffsetNumber(child);
/*
+ * Since we've already loaded the child block, combine this check with
+ * check for downlink connectivity.
+ */
+ bt_child_highkey_check(state, downlinkoffnum,
+ child, topaque->btpo.level);
+
+ /*
* Since there cannot be a concurrent VACUUM operation in readonly mode,
* and since a page has no links within other pages (siblings and parent)
* once it is marked fully deleted, it should be impossible to land on a
@@ -1734,23 +2079,27 @@ bt_downlink_check(BtreeCheckState *state, BTScanInsert targetkey,
* be concerned about concurrent page splits or page deletions.
*/
static void
-bt_downlink_missing_check(BtreeCheckState *state)
+bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit,
+ BlockNumber blkno, Page page)
{
- BTPageOpaque topaque = (BTPageOpaque) PageGetSpecialPointer(state->target);
+ BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
ItemId itemid;
IndexTuple itup;
Page child;
BTPageOpaque copaque;
uint32 level;
BlockNumber childblk;
+ XLogRecPtr pagelsn;
- Assert(state->heapallindexed && state->readonly);
- Assert(!P_IGNORE(topaque));
+ Assert(state->readonly);
+ Assert(!P_IGNORE(opaque));
/* No next level up with downlinks to fingerprint from the true root */
- if (P_ISROOT(topaque))
+ if (P_ISROOT(opaque))
return;
+ pagelsn = PageGetLSN(page);
+
/*
* Incomplete (interrupted) page splits can account for the lack of a
* downlink. Some inserting transaction should eventually complete the
@@ -1772,54 +2121,47 @@ bt_downlink_missing_check(BtreeCheckState *state)
* by design, so it shouldn't be taken to indicate corruption. See
* _bt_pagedel() for full details.
*/
- if (state->rightsplit)
+ if (rightsplit)
{
ereport(DEBUG1,
(errcode(ERRCODE_NO_DATA),
errmsg("harmless interrupted page split detected in index %s",
RelationGetRelationName(state->rel)),
errdetail_internal("Block=%u level=%u left sibling=%u page lsn=%X/%X.",
- state->targetblock, topaque->btpo.level,
- topaque->btpo_prev,
- (uint32) (state->targetlsn >> 32),
- (uint32) state->targetlsn)));
+ blkno, opaque->btpo.level,
+ opaque->btpo_prev,
+ (uint32) (pagelsn >> 32),
+ (uint32) pagelsn)));
return;
}
- /* Target's downlink is typically present in parent/fingerprinted */
- if (!bloom_lacks_element(state->downlinkfilter,
- (unsigned char *) &state->targetblock,
- sizeof(BlockNumber)))
- return;
-
/*
- * Target is probably the "top parent" of a multi-level page deletion.
- * We'll need to descend the subtree to make sure that descendant pages
- * are consistent with that, though.
+ * Page under check is probably the "top parent" of a multi-level page
+ * deletion. We'll need to descend the subtree to make sure that
+ * descendant pages are consistent with that, though.
*
- * If the target page (which must be non-ignorable) is a leaf page, then
- * clearly it can't be the top parent. The lack of a downlink is probably
- * a symptom of a broad problem that could just as easily cause
+ * If the page (which must be non-ignorable) is a leaf page, then clearly
+ * it can't be the top parent. The lack of a downlink is probably a
+ * symptom of a broad problem that could just as easily cause
* inconsistencies anywhere else.
*/
- if (P_ISLEAF(topaque))
+ if (P_ISLEAF(opaque))
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg("leaf index block lacks downlink in index \"%s\"",
RelationGetRelationName(state->rel)),
errdetail_internal("Block=%u page lsn=%X/%X.",
- state->targetblock,
- (uint32) (state->targetlsn >> 32),
- (uint32) state->targetlsn)));
+ blkno,
+ (uint32) (pagelsn >> 32),
+ (uint32) pagelsn)));
- /* Descend from the target page, which is an internal page */
+ /* Descend from the given page, which is an internal page */
elog(DEBUG1, "checking for interrupted multi-level deletion due to missing downlink in index \"%s\"",
RelationGetRelationName(state->rel));
- level = topaque->btpo.level;
- itemid = PageGetItemIdCareful(state, state->targetblock, state->target,
- P_FIRSTDATAKEY(topaque));
- itup = (IndexTuple) PageGetItem(state->target, itemid);
+ level = opaque->btpo.level;
+ itemid = PageGetItemIdCareful(state, blkno, page, P_FIRSTDATAKEY(opaque));
+ itup = (IndexTuple) PageGetItem(page, itemid);
childblk = BTreeTupleGetDownLink(itup);
for (;;)
{
@@ -1837,8 +2179,8 @@ bt_downlink_missing_check(BtreeCheckState *state)
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg_internal("downlink points to block in index \"%s\" whose level is not one level down",
RelationGetRelationName(state->rel)),
- errdetail_internal("Top parent/target block=%u block pointed to=%u expected level=%u level in pointed to block=%u.",
- state->targetblock, childblk,
+ errdetail_internal("Top parent/under check block=%u block pointed to=%u expected level=%u level in pointed to block=%u.",
+ blkno, childblk,
level - 1, copaque->btpo.level)));
level = copaque->btpo.level;
@@ -1854,12 +2196,12 @@ bt_downlink_missing_check(BtreeCheckState *state)
* Since there cannot be a concurrent VACUUM operation in readonly mode,
* and since a page has no links within other pages (siblings and parent)
* once it is marked fully deleted, it should be impossible to land on a
- * fully deleted page. See bt_downlink_check() for further details.
+ * fully deleted page. See bt_child_check() for further details.
*
- * The bt_downlink_check() P_ISDELETED() check is repeated here because
- * bt_downlink_check() does not visit pages reachable through negative
- * infinity items. Besides, bt_downlink_check() is unwilling to descend
- * multiple levels. (The similar bt_downlink_check() P_ISDELETED() check
+ * The bt_child_check() P_ISDELETED() check is repeated here because
+ * bt_child_check() does not visit pages reachable through negative
+ * infinity items. Besides, bt_child_check() is unwilling to descend
+ * multiple levels. (The similar bt_child_check() P_ISDELETED() check
* within bt_check_level_from_leftmost() won't reach the page either,
* since the leaf's live siblings should have their sibling links updated
* to bypass the deletion target page when it is marked fully dead.)
@@ -1875,26 +2217,26 @@ bt_downlink_missing_check(BtreeCheckState *state)
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg_internal("downlink to deleted leaf page found in index \"%s\"",
RelationGetRelationName(state->rel)),
- errdetail_internal("Top parent/target block=%u leaf block=%u top parent/target lsn=%X/%X.",
- state->targetblock, childblk,
- (uint32) (state->targetlsn >> 32),
- (uint32) state->targetlsn)));
+ errdetail_internal("Top parent/target block=%u leaf block=%u top parent/under check lsn=%X/%X.",
+ blkno, childblk,
+ (uint32) (pagelsn >> 32),
+ (uint32) pagelsn)));
/*
* Iff leaf page is half-dead, its high key top parent link should point
* to what VACUUM considered to be the top parent page at the instant it
* was interrupted. Provided the high key link actually points to the
- * target page, the missing downlink we detected is consistent with there
- * having been an interrupted multi-level page deletion. This means that
- * the subtree with the target page at its root (a page deletion chain) is
- * in a consistent state, enabling VACUUM to resume deleting the entire
- * chain the next time it encounters the half-dead leaf page.
+ * page under check, the missing downlink we detected is consistent with
+ * there having been an interrupted multi-level page deletion. This means
+ * that the subtree with the page under check at its root (a page deletion
+ * chain) is in a consistent state, enabling VACUUM to resume deleting the
+ * entire chain the next time it encounters the half-dead leaf page.
*/
if (P_ISHALFDEAD(copaque) && !P_RIGHTMOST(copaque))
{
itemid = PageGetItemIdCareful(state, childblk, child, P_HIKEY);
itup = (IndexTuple) PageGetItem(child, itemid);
- if (BTreeTupleGetTopParent(itup) == state->targetblock)
+ if (BTreeTupleGetTopParent(itup) == blkno)
return;
}
@@ -1903,9 +2245,9 @@ bt_downlink_missing_check(BtreeCheckState *state)
errmsg("internal index block lacks downlink in index \"%s\"",
RelationGetRelationName(state->rel)),
errdetail_internal("Block=%u level=%u page lsn=%X/%X.",
- state->targetblock, topaque->btpo.level,
- (uint32) (state->targetlsn >> 32),
- (uint32) state->targetlsn)));
+ blkno, opaque->btpo.level,
+ (uint32) (pagelsn >> 32),
+ (uint32) pagelsn)));
}
/*
@@ -2146,7 +2488,7 @@ bt_posting_plain_tuple(IndexTuple itup, int n)
* separator key value won't always be available from parent, though, because
* the first items of internal pages are negative infinity items, truncated
* down to zero attributes during internal page splits. While it's true that
- * bt_downlink_check() and the high key check can detect most imaginable key
+ * bt_child_check() and the high key check can detect most imaginable key
* space problems, there are remaining problems it won't detect with non-pivot
* tuples in cousin leaf pages. Starting a search from the root for every
* existing leaf tuple detects small inconsistencies in upper levels of the
diff --git a/doc/src/sgml/amcheck.sgml b/doc/src/sgml/amcheck.sgml
index fe0fe9c186e..c912d2aa895 100644
--- a/doc/src/sgml/amcheck.sgml
+++ b/doc/src/sgml/amcheck.sgml
@@ -125,8 +125,7 @@ ORDER BY c.relpages DESC LIMIT 10;
Optionally, when the <parameter>heapallindexed</parameter>
argument is <literal>true</literal>, the function verifies the
presence of all heap tuples that should be found within the
- index, and that there are no missing downlinks in the index
- structure. When the optional <parameter>rootdescend</parameter>
+ index. When the optional <parameter>rootdescend</parameter>
argument is <literal>true</literal>, verification re-finds
tuples on the leaf level by performing a new search from the
root page for each tuple. The checks that can be performed by
@@ -136,7 +135,8 @@ ORDER BY c.relpages DESC LIMIT 10;
a more thorough variant of <function>bt_index_check</function>:
unlike <function>bt_index_check</function>,
<function>bt_index_parent_check</function> also checks
- invariants that span parent/child relationships.
+ invariants that span parent/child relationships, including checking
+ that there are no missing downlinks in the index structure.
<function>bt_index_parent_check</function> follows the general
convention of raising an error if it finds a logical
inconsistency or other problem.