diff options
-rw-r--r-- | contrib/amcheck/verify_nbtree.c | 582 | ||||
-rw-r--r-- | doc/src/sgml/amcheck.sgml | 6 |
2 files changed, 465 insertions, 123 deletions
diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c index 31717321b0d..ceaaa271680 100644 --- a/contrib/amcheck/verify_nbtree.c +++ b/contrib/amcheck/verify_nbtree.c @@ -95,15 +95,25 @@ typedef struct BtreeCheckState XLogRecPtr targetlsn; /* + * Low key: high key of left sibling of target page. Used only for child + * verification. So, 'lowkey' is kept only when 'readonly' is set. + */ + IndexTuple lowkey; + + /* + * The rightlink and incomplete split flag of block one level down to the + * target page, which was visited last time via downlink from taget page. + * We use it to check for missing downlinks. + */ + BlockNumber prevrightlink; + bool previncompletesplit; + + /* * Mutable state, for optional heapallindexed verification: */ /* Bloom filter fingerprints B-Tree index */ bloom_filter *filter; - /* Bloom filter fingerprints downlink blocks within tree */ - bloom_filter *downlinkfilter; - /* Right half of incomplete split marker */ - bool rightsplit; /* Debug counter */ int64 heaptuplespresent; } BtreeCheckState; @@ -137,9 +147,14 @@ static BtreeLevel bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level); static void bt_target_page_check(BtreeCheckState *state); static BTScanInsert bt_right_page_check_scankey(BtreeCheckState *state); -static void bt_downlink_check(BtreeCheckState *state, BTScanInsert targetkey, - BlockNumber childblock); -static void bt_downlink_missing_check(BtreeCheckState *state); +static void bt_child_check(BtreeCheckState *state, BTScanInsert targetkey, + OffsetNumber downlinkoffnum); +static void bt_child_highkey_check(BtreeCheckState *state, + OffsetNumber target_downlinkoffnum, + Page loaded_child, + uint32 target_level); +static void bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit, + BlockNumber targetblock, Page target); static void bt_tuple_present_callback(Relation index, ItemPointer tid, Datum *values, bool *isnull, bool tupleIsAlive, void *checkstate); @@ -470,20 +485,6 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace, errmsg("index \"%s\" cannot be verified using transaction snapshot", RelationGetRelationName(rel)))); } - else - { - /* - * Extra readonly downlink check. - * - * In readonly case, we know that there cannot be a concurrent - * page split or a concurrent page deletion, which gives us the - * opportunity to verify that every non-ignorable page had a - * downlink one level up. We must be tolerant of interrupted page - * splits and page deletions, though. This is taken care of in - * bt_downlink_missing_check(). - */ - state->downlinkfilter = bloom_create(total_pages, work_mem, seed); - } } Assert(!state->rootdescend || state->readonly); @@ -533,12 +534,6 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace, while (current.leftmost != P_NONE) { /* - * Leftmost page on level cannot be right half of incomplete split. - * This can go stale immediately in !readonly case. - */ - state->rightsplit = false; - - /* * Verify this level, and get left most page for next level down, if * not at leaf level */ @@ -561,16 +556,6 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace, IndexInfo *indexinfo = BuildIndexInfo(state->rel); TableScanDesc scan; - /* Report on extra downlink checks performed in readonly case */ - if (state->readonly) - { - ereport(DEBUG1, - (errmsg_internal("finished verifying presence of downlink blocks within index \"%s\" with bitset %.2f%% set", - RelationGetRelationName(rel), - 100.0 * bloom_prop_bits_set(state->downlinkfilter)))); - bloom_free(state->downlinkfilter); - } - /* * Create our own scan for table_index_build_scan(), rather than * getting it to do so for us. This is required so that we can @@ -673,6 +658,9 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level) level.istruerootlevel ? " (true root level)" : level.level == 0 ? " (leaf level)" : ""); + state->prevrightlink = InvalidBlockNumber; + state->previncompletesplit = false; + do { /* Don't rely on CHECK_FOR_INTERRUPTS() calls at lower level */ @@ -692,9 +680,9 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level) * mode, and since a page has no links within other pages * (siblings and parent) once it is marked fully deleted, it * should be impossible to land on a fully deleted page in - * readonly mode. See bt_downlink_check() for further details. + * readonly mode. See bt_child_check() for further details. * - * The bt_downlink_check() P_ISDELETED() check is repeated here so + * The bt_child_check() P_ISDELETED() check is repeated here so * that pages that are only reachable through sibling links get * checked. */ @@ -816,21 +804,57 @@ nextpage: errmsg("circular link chain found in block %u of index \"%s\"", current, RelationGetRelationName(state->rel)))); + leftcurrent = current; + current = opaque->btpo_next; + + if (state->lowkey) + { + Assert(state->readonly); + pfree(state->lowkey); + state->lowkey = NULL; + } + /* - * Record if page that is about to become target is the right half of - * an incomplete page split. This can go stale immediately in - * !readonly case. + * Copy current target high key as the low key of right sibling. + * Allocate memory in upper level context, so it would be cleared + * after reset of target context. + * + * We only need the low key in corner cases of checking child high + * keys. We use high key only when incomplete split on the child level + * falls to the boundary of pages on the target level. See + * bt_child_highkey_check() for details. So, typically we won't end + * up doing anything with low key, but it's simpler for general case + * high key verification to always have it available. + * + * The correctness of managing low key in the case of concurrent + * splits wasn't investigated yet. Thankfully we only need low key + * for readonly verification and concurrent splits won't happen. */ - state->rightsplit = P_INCOMPLETE_SPLIT(opaque); + if (state->readonly && !P_RIGHTMOST(opaque)) + { + IndexTuple itup; + ItemId itemid; - leftcurrent = current; - current = opaque->btpo_next; + itemid = PageGetItemIdCareful(state, state->targetblock, + state->target, P_HIKEY); + itup = (IndexTuple) PageGetItem(state->target, itemid); + + state->lowkey = MemoryContextAlloc(oldcontext, IndexTupleSize(itup)); + memcpy(state->lowkey, itup, IndexTupleSize(itup)); + } /* Free page and associated memory for this iteration */ MemoryContextReset(state->targetcontext); } while (current != P_NONE); + if (state->lowkey) + { + Assert(state->readonly); + pfree(state->lowkey); + state->lowkey = NULL; + } + /* Don't change context for caller */ MemoryContextSwitchTo(oldcontext); @@ -863,7 +887,9 @@ nextpage: * tuple. * * - That downlink to block was encountered in parent where that's expected. - * (Limited to heapallindexed readonly callers.) + * (Limited to readonly callers.) + * + * - That high keys of child pages matches corresponding pivot keys in parent. * * This is also where heapallindexed callers use their Bloom filter to * fingerprint IndexTuples for later table_index_build_scan() verification. @@ -981,22 +1007,26 @@ bt_target_page_check(BtreeCheckState *state) (uint32) state->targetlsn))); } - /* Fingerprint downlink blocks in heapallindexed + readonly case */ - if (state->heapallindexed && state->readonly && !P_ISLEAF(topaque)) - { - BlockNumber childblock = BTreeTupleGetDownLink(itup); - - bloom_add_element(state->downlinkfilter, - (unsigned char *) &childblock, - sizeof(BlockNumber)); - } - /* * Don't try to generate scankey using "negative infinity" item on * internal pages. They are always truncated to zero attributes. */ if (offset_is_negative_infinity(topaque, offset)) + { + /* + * We don't call bt_child_check() for "negative infinity" items. + * But if we're performing downlink connectivity check, we do it + * for every item including "negative infinity" one. + */ + if (!P_ISLEAF(topaque) && state->readonly) + { + bt_child_highkey_check(state, + offset, + NULL, + topaque->btpo.level); + } continue; + } /* * Readonly callers may optionally verify that non-pivot tuples can @@ -1340,20 +1370,23 @@ bt_target_page_check(BtreeCheckState *state) * because it has no useful value to compare). */ if (!P_ISLEAF(topaque) && state->readonly) - { - BlockNumber childblock = BTreeTupleGetDownLink(itup); - - bt_downlink_check(state, skey, childblock); - } + bt_child_check(state, skey, offset); } /* - * * Check if page has a downlink in parent * + * Special case bt_child_highkey_check() call * - * This can only be checked in heapallindexed + readonly case. + * We don't pass an real downlink, but we've to finish the level + * processing. If condition is satisfied, we've already processed all the + * downlinks from the target level. But there still might be pages to the + * right of the child page pointer to by our rightmost downlink. And they + * might have missing downlinks. This final call checks for them. */ - if (state->heapallindexed && state->readonly) - bt_downlink_missing_check(state); + if (!P_ISLEAF(topaque) && P_RIGHTMOST(topaque) && state->readonly) + { + bt_child_highkey_check(state, InvalidOffsetNumber, + NULL, topaque->btpo.level); + } } /* @@ -1497,7 +1530,7 @@ bt_right_page_check_scankey(BtreeCheckState *state) * Top level tree walk caller moves on to next page (makes it the new * target) following recovery from this race. (cf. The rationale for * child/downlink verification needing a ShareLock within - * bt_downlink_check(), where page deletion is also the main source of + * bt_child_check(), where page deletion is also the main source of * trouble.) * * Note that it doesn't matter if right sibling page here is actually a @@ -1571,6 +1604,297 @@ bt_right_page_check_scankey(BtreeCheckState *state) } /* + * Check if two tuples are binary identical except the block number. So, + * this function is capable to compare pivot keys on different levels. + */ +static bool +bt_pivot_tuple_identical(IndexTuple itup1, IndexTuple itup2) +{ + if (IndexTupleSize(itup1) != IndexTupleSize(itup2)) + return false; + + if (memcmp(&itup1->t_tid.ip_posid, &itup2->t_tid.ip_posid, + IndexTupleSize(itup1) - offsetof(ItemPointerData, ip_posid)) != 0) + return false; + + return true; +} + +/*--- + * Check high keys on the child level. Traverse rightlinks from previous + * downlink to the current one. Check that there are no intermediate pages + * with missing downlinks. + * + * If 'loaded_child' is given, it's assumed to be the page pointed to by the + * downlink referenced by 'downlinkoffnum' of the target page. + * + * Basically this function is called for each target downlink and checks two + * invariants: + * + * 1) You can reach the next child from previous one via rightlinks; + * 2) Each child high key have matching pivot key on target level. + * + * Consider the sample tree picture. + * + * 1 + * / \ + * 2 <-> 3 + * / \ / \ + * 4 <> 5 <> 6 <> 7 <> 8 + * + * This function will be called for blocks 4, 5, 6 and 8. Consider what is + * happening for each function call. + * + * - The function call for block 4 initializes data structure and matches high + * key of block 4 to downlink's pivot key of block 2. + * - The high key of block 5 is matched to the high key of block 2. + * - The block 6 has an incomplete split flag set, so its high key isn't + * matched to anything. + * - The function call for block 8 checks that block 8 can be found while + * following rightlinks from block 6. The high key of block 7 will be + * matched to downlink's pivot key in block 3. + * + * There is also final call of this function, which checks that there is no + * missing downlinks for children to the right of the child referenced by + * rightmost downlink in target level. + */ +static void +bt_child_highkey_check(BtreeCheckState *state, + OffsetNumber target_downlinkoffnum, + Page loaded_child, + uint32 target_level) +{ + BlockNumber blkno = state->prevrightlink; + Page page; + BTPageOpaque opaque; + bool rightsplit = state->previncompletesplit; + bool first = true; + ItemId itemid; + IndexTuple itup; + BlockNumber downlink; + + if (OffsetNumberIsValid(target_downlinkoffnum)) + { + itemid = PageGetItemIdCareful(state, state->targetblock, + state->target, target_downlinkoffnum); + itup = (IndexTuple) PageGetItem(state->target, itemid); + downlink = BTreeTupleGetDownLink(itup); + } + else + { + downlink = P_NONE; + } + + /* + * If no previous rightlink is memorized for current level just below + * target page's level, we are about to start from the leftmost page. We + * can't follow rightlinks from previous page, because there is no + * previous page. But we still can match high key. + * + * So we initialize variables for the loop above like there is previous + * page referencing current child. Also we imply previous page to not + * have incomplete split flag, that would make us require downlink for + * current child. That's correct, because leftmost page on the level + * should always have parent downlink. + */ + if (!BlockNumberIsValid(blkno)) + { + blkno = downlink; + rightsplit = false; + } + + /* Move to the right on the child level */ + while (true) + { + /* + * Did we traverse the whole tree level and this is check for pages to + * the right of rightmost downlink? + */ + if (blkno == P_NONE && downlink == P_NONE) + { + state->prevrightlink = InvalidBlockNumber; + state->previncompletesplit = false; + return; + } + + /* Did we traverse the whole tree level and don't find next downlink? */ + if (blkno == P_NONE) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("can't traverse from downlink %u to downlink %u of index \"%s\"", + state->prevrightlink, downlink, + RelationGetRelationName(state->rel)))); + + /* Load page contents */ + if (blkno == downlink && loaded_child) + page = loaded_child; + else + page = palloc_btree_page(state, blkno); + + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + /* The first page we visit at the level should be leftmost */ + if (first && !BlockNumberIsValid(state->prevrightlink) && !P_LEFTMOST(opaque)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("the first child of leftmost target page is not leftmost of its level in index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Target block=%u child block=%u target page lsn=%X/%X.", + state->targetblock, blkno, + (uint32) (state->targetlsn >> 32), + (uint32) state->targetlsn))); + + /* Check level for non-ignorable page */ + if (!P_IGNORE(opaque) && opaque->btpo.level != target_level - 1) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("block found while following rightlinks from child of index \"%s\" has invalid level", + RelationGetRelationName(state->rel)), + errdetail_internal("Block pointed to=%u expected level=%u level in pointed to block=%u.", + blkno, target_level - 1, opaque->btpo.level))); + + /* Try to detect circular links */ + if ((!first && blkno == state->prevrightlink) || blkno == opaque->btpo_prev) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("circular link chain found in block %u of index \"%s\"", + blkno, RelationGetRelationName(state->rel)))); + + if (blkno != downlink && !P_IGNORE(opaque)) + { + /* blkno probably has missing parent downlink */ + bt_downlink_missing_check(state, rightsplit, blkno, page); + } + + rightsplit = P_INCOMPLETE_SPLIT(opaque); + + /* + * If we visit page with high key, check that it is be equal to the + * target key next to corresponding downlink. + */ + if (!rightsplit && !P_RIGHTMOST(opaque)) + { + BTPageOpaque topaque; + IndexTuple highkey; + OffsetNumber pivotkey_offset; + + /* Get high key */ + itemid = PageGetItemIdCareful(state, blkno, page, P_HIKEY); + highkey = (IndexTuple) PageGetItem(page, itemid); + + /* + * There might be two situations when we examine high key. If + * current child page is referenced by given target downlink, we + * should look to the next offset number for matching key from + * target page. + * + * Alternatively, we're following rightlinks somewhere in the + * middle between page referenced by previous target's downlink + * and the page referenced by current target's downlink. If + * current child page hasn't incomplete split flag set, then its + * high key should match to the target's key of current offset + * number. This happens when a previous call here (to + * bt_child_highkey_check()) found an incomplete split, and we + * reach a right sibling page without a downlink -- the right + * sibling page's high key still needs to be matched to a + * separator key on the parent/target level. + * + * Don't apply OffsetNumberNext() to target_downlinkoffnum when we + * already had to step right on the child level. Our traversal of + * the child level must try to move in perfect lockstep behind (to + * the left of) the target/parent level traversal. + */ + if (blkno == downlink) + pivotkey_offset = OffsetNumberNext(target_downlinkoffnum); + else + pivotkey_offset = target_downlinkoffnum; + + topaque = (BTPageOpaque) PageGetSpecialPointer(state->target); + + if (!offset_is_negative_infinity(topaque, pivotkey_offset)) + { + /* + * If we're looking for the next pivot tuple in target page, + * but there is no more pivot tuples, then we should match to + * high key instead. + */ + if (pivotkey_offset > PageGetMaxOffsetNumber(state->target)) + { + if (P_RIGHTMOST(topaque)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("child high key is greater than rightmost pivot key on target level in index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Target block=%u child block=%u target page lsn=%X/%X.", + state->targetblock, blkno, + (uint32) (state->targetlsn >> 32), + (uint32) state->targetlsn))); + pivotkey_offset = P_HIKEY; + } + itemid = PageGetItemIdCareful(state, state->targetblock, + state->target, pivotkey_offset); + itup = (IndexTuple) PageGetItem(state->target, itemid); + } + else + { + /* + * We cannot try to match child's high key to a negative + * infinity key in target, since there is nothing to compare. + * However, it's still possible to match child's high key + * outside of target page. The reason why we're are is that + * bt_child_highkey_check() was previously called for the + * cousin page of 'loaded_child', which is incomplete split. + * So, now we traverse to the right of that cousin page and + * current child level page under consideration still belongs + * to the subtree of target's left sibling. Thus, we need to + * match child's high key to it's left uncle page high key. + * Thankfully we saved it, it's called a "low key" of target + * page. + */ + if (!state->lowkey) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("can't find left sibling high key in index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Target block=%u child block=%u target page lsn=%X/%X.", + state->targetblock, blkno, + (uint32) (state->targetlsn >> 32), + (uint32) state->targetlsn))); + itup = state->lowkey; + } + + if (!bt_pivot_tuple_identical(highkey, itup)) + { + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("mismatch between parent key and child high key in index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Target block=%u child block=%u target page lsn=%X/%X.", + state->targetblock, blkno, + (uint32) (state->targetlsn >> 32), + (uint32) state->targetlsn))); + } + } + + /* Exit if we already found next downlink */ + if (blkno == downlink) + { + state->prevrightlink = opaque->btpo_next; + state->previncompletesplit = rightsplit; + return; + } + + /* Traverse to the next page using rightlink */ + blkno = opaque->btpo_next; + + /* Free page contents if it's allocated by us */ + if (page != loaded_child) + pfree(page); + first = false; + } +} + +/* * Checks one of target's downlink against its child page. * * Conceptually, the target page continues to be what is checked here. The @@ -1578,15 +1902,28 @@ bt_right_page_check_scankey(BtreeCheckState *state) * The downlink insertion into the target is probably where any problem raised * here arises, and there is no such thing as a parent link, so doing the * verification this way around is much more practical. + * + * This function visits child page and it's sequentially called for each + * downlink of target page. Assuming this we also check downlink connectivity + * here in order to save child page visits. */ static void -bt_downlink_check(BtreeCheckState *state, BTScanInsert targetkey, - BlockNumber childblock) +bt_child_check(BtreeCheckState *state, BTScanInsert targetkey, + OffsetNumber downlinkoffnum) { + ItemId itemid; + IndexTuple itup; + BlockNumber childblock; OffsetNumber offset; OffsetNumber maxoffset; Page child; BTPageOpaque copaque; + BTPageOpaque topaque; + + itemid = PageGetItemIdCareful(state, state->targetblock, + state->target, downlinkoffnum); + itup = (IndexTuple) PageGetItem(state->target, itemid); + childblock = BTreeTupleGetDownLink(itup); /* * Caller must have ShareLock on target relation, because of @@ -1636,11 +1973,19 @@ bt_downlink_check(BtreeCheckState *state, BTScanInsert targetkey, * Check all items, rather than checking just the first and trusting that * the operator class obeys the transitive law. */ + topaque = (BTPageOpaque) PageGetSpecialPointer(state->target); child = palloc_btree_page(state, childblock); copaque = (BTPageOpaque) PageGetSpecialPointer(child); maxoffset = PageGetMaxOffsetNumber(child); /* + * Since we've already loaded the child block, combine this check with + * check for downlink connectivity. + */ + bt_child_highkey_check(state, downlinkoffnum, + child, topaque->btpo.level); + + /* * Since there cannot be a concurrent VACUUM operation in readonly mode, * and since a page has no links within other pages (siblings and parent) * once it is marked fully deleted, it should be impossible to land on a @@ -1734,23 +2079,27 @@ bt_downlink_check(BtreeCheckState *state, BTScanInsert targetkey, * be concerned about concurrent page splits or page deletions. */ static void -bt_downlink_missing_check(BtreeCheckState *state) +bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit, + BlockNumber blkno, Page page) { - BTPageOpaque topaque = (BTPageOpaque) PageGetSpecialPointer(state->target); + BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page); ItemId itemid; IndexTuple itup; Page child; BTPageOpaque copaque; uint32 level; BlockNumber childblk; + XLogRecPtr pagelsn; - Assert(state->heapallindexed && state->readonly); - Assert(!P_IGNORE(topaque)); + Assert(state->readonly); + Assert(!P_IGNORE(opaque)); /* No next level up with downlinks to fingerprint from the true root */ - if (P_ISROOT(topaque)) + if (P_ISROOT(opaque)) return; + pagelsn = PageGetLSN(page); + /* * Incomplete (interrupted) page splits can account for the lack of a * downlink. Some inserting transaction should eventually complete the @@ -1772,54 +2121,47 @@ bt_downlink_missing_check(BtreeCheckState *state) * by design, so it shouldn't be taken to indicate corruption. See * _bt_pagedel() for full details. */ - if (state->rightsplit) + if (rightsplit) { ereport(DEBUG1, (errcode(ERRCODE_NO_DATA), errmsg("harmless interrupted page split detected in index %s", RelationGetRelationName(state->rel)), errdetail_internal("Block=%u level=%u left sibling=%u page lsn=%X/%X.", - state->targetblock, topaque->btpo.level, - topaque->btpo_prev, - (uint32) (state->targetlsn >> 32), - (uint32) state->targetlsn))); + blkno, opaque->btpo.level, + opaque->btpo_prev, + (uint32) (pagelsn >> 32), + (uint32) pagelsn))); return; } - /* Target's downlink is typically present in parent/fingerprinted */ - if (!bloom_lacks_element(state->downlinkfilter, - (unsigned char *) &state->targetblock, - sizeof(BlockNumber))) - return; - /* - * Target is probably the "top parent" of a multi-level page deletion. - * We'll need to descend the subtree to make sure that descendant pages - * are consistent with that, though. + * Page under check is probably the "top parent" of a multi-level page + * deletion. We'll need to descend the subtree to make sure that + * descendant pages are consistent with that, though. * - * If the target page (which must be non-ignorable) is a leaf page, then - * clearly it can't be the top parent. The lack of a downlink is probably - * a symptom of a broad problem that could just as easily cause + * If the page (which must be non-ignorable) is a leaf page, then clearly + * it can't be the top parent. The lack of a downlink is probably a + * symptom of a broad problem that could just as easily cause * inconsistencies anywhere else. */ - if (P_ISLEAF(topaque)) + if (P_ISLEAF(opaque)) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("leaf index block lacks downlink in index \"%s\"", RelationGetRelationName(state->rel)), errdetail_internal("Block=%u page lsn=%X/%X.", - state->targetblock, - (uint32) (state->targetlsn >> 32), - (uint32) state->targetlsn))); + blkno, + (uint32) (pagelsn >> 32), + (uint32) pagelsn))); - /* Descend from the target page, which is an internal page */ + /* Descend from the given page, which is an internal page */ elog(DEBUG1, "checking for interrupted multi-level deletion due to missing downlink in index \"%s\"", RelationGetRelationName(state->rel)); - level = topaque->btpo.level; - itemid = PageGetItemIdCareful(state, state->targetblock, state->target, - P_FIRSTDATAKEY(topaque)); - itup = (IndexTuple) PageGetItem(state->target, itemid); + level = opaque->btpo.level; + itemid = PageGetItemIdCareful(state, blkno, page, P_FIRSTDATAKEY(opaque)); + itup = (IndexTuple) PageGetItem(page, itemid); childblk = BTreeTupleGetDownLink(itup); for (;;) { @@ -1837,8 +2179,8 @@ bt_downlink_missing_check(BtreeCheckState *state) (errcode(ERRCODE_INDEX_CORRUPTED), errmsg_internal("downlink points to block in index \"%s\" whose level is not one level down", RelationGetRelationName(state->rel)), - errdetail_internal("Top parent/target block=%u block pointed to=%u expected level=%u level in pointed to block=%u.", - state->targetblock, childblk, + errdetail_internal("Top parent/under check block=%u block pointed to=%u expected level=%u level in pointed to block=%u.", + blkno, childblk, level - 1, copaque->btpo.level))); level = copaque->btpo.level; @@ -1854,12 +2196,12 @@ bt_downlink_missing_check(BtreeCheckState *state) * Since there cannot be a concurrent VACUUM operation in readonly mode, * and since a page has no links within other pages (siblings and parent) * once it is marked fully deleted, it should be impossible to land on a - * fully deleted page. See bt_downlink_check() for further details. + * fully deleted page. See bt_child_check() for further details. * - * The bt_downlink_check() P_ISDELETED() check is repeated here because - * bt_downlink_check() does not visit pages reachable through negative - * infinity items. Besides, bt_downlink_check() is unwilling to descend - * multiple levels. (The similar bt_downlink_check() P_ISDELETED() check + * The bt_child_check() P_ISDELETED() check is repeated here because + * bt_child_check() does not visit pages reachable through negative + * infinity items. Besides, bt_child_check() is unwilling to descend + * multiple levels. (The similar bt_child_check() P_ISDELETED() check * within bt_check_level_from_leftmost() won't reach the page either, * since the leaf's live siblings should have their sibling links updated * to bypass the deletion target page when it is marked fully dead.) @@ -1875,26 +2217,26 @@ bt_downlink_missing_check(BtreeCheckState *state) (errcode(ERRCODE_INDEX_CORRUPTED), errmsg_internal("downlink to deleted leaf page found in index \"%s\"", RelationGetRelationName(state->rel)), - errdetail_internal("Top parent/target block=%u leaf block=%u top parent/target lsn=%X/%X.", - state->targetblock, childblk, - (uint32) (state->targetlsn >> 32), - (uint32) state->targetlsn))); + errdetail_internal("Top parent/target block=%u leaf block=%u top parent/under check lsn=%X/%X.", + blkno, childblk, + (uint32) (pagelsn >> 32), + (uint32) pagelsn))); /* * Iff leaf page is half-dead, its high key top parent link should point * to what VACUUM considered to be the top parent page at the instant it * was interrupted. Provided the high key link actually points to the - * target page, the missing downlink we detected is consistent with there - * having been an interrupted multi-level page deletion. This means that - * the subtree with the target page at its root (a page deletion chain) is - * in a consistent state, enabling VACUUM to resume deleting the entire - * chain the next time it encounters the half-dead leaf page. + * page under check, the missing downlink we detected is consistent with + * there having been an interrupted multi-level page deletion. This means + * that the subtree with the page under check at its root (a page deletion + * chain) is in a consistent state, enabling VACUUM to resume deleting the + * entire chain the next time it encounters the half-dead leaf page. */ if (P_ISHALFDEAD(copaque) && !P_RIGHTMOST(copaque)) { itemid = PageGetItemIdCareful(state, childblk, child, P_HIKEY); itup = (IndexTuple) PageGetItem(child, itemid); - if (BTreeTupleGetTopParent(itup) == state->targetblock) + if (BTreeTupleGetTopParent(itup) == blkno) return; } @@ -1903,9 +2245,9 @@ bt_downlink_missing_check(BtreeCheckState *state) errmsg("internal index block lacks downlink in index \"%s\"", RelationGetRelationName(state->rel)), errdetail_internal("Block=%u level=%u page lsn=%X/%X.", - state->targetblock, topaque->btpo.level, - (uint32) (state->targetlsn >> 32), - (uint32) state->targetlsn))); + blkno, opaque->btpo.level, + (uint32) (pagelsn >> 32), + (uint32) pagelsn))); } /* @@ -2146,7 +2488,7 @@ bt_posting_plain_tuple(IndexTuple itup, int n) * separator key value won't always be available from parent, though, because * the first items of internal pages are negative infinity items, truncated * down to zero attributes during internal page splits. While it's true that - * bt_downlink_check() and the high key check can detect most imaginable key + * bt_child_check() and the high key check can detect most imaginable key * space problems, there are remaining problems it won't detect with non-pivot * tuples in cousin leaf pages. Starting a search from the root for every * existing leaf tuple detects small inconsistencies in upper levels of the diff --git a/doc/src/sgml/amcheck.sgml b/doc/src/sgml/amcheck.sgml index fe0fe9c186e..c912d2aa895 100644 --- a/doc/src/sgml/amcheck.sgml +++ b/doc/src/sgml/amcheck.sgml @@ -125,8 +125,7 @@ ORDER BY c.relpages DESC LIMIT 10; Optionally, when the <parameter>heapallindexed</parameter> argument is <literal>true</literal>, the function verifies the presence of all heap tuples that should be found within the - index, and that there are no missing downlinks in the index - structure. When the optional <parameter>rootdescend</parameter> + index. When the optional <parameter>rootdescend</parameter> argument is <literal>true</literal>, verification re-finds tuples on the leaf level by performing a new search from the root page for each tuple. The checks that can be performed by @@ -136,7 +135,8 @@ ORDER BY c.relpages DESC LIMIT 10; a more thorough variant of <function>bt_index_check</function>: unlike <function>bt_index_check</function>, <function>bt_index_parent_check</function> also checks - invariants that span parent/child relationships. + invariants that span parent/child relationships, including checking + that there are no missing downlinks in the index structure. <function>bt_index_parent_check</function> follows the general convention of raising an error if it finds a logical inconsistency or other problem. |