diff options
Diffstat (limited to 'src/backend/access/nbtree/nbtsearch.c')
-rw-r--r-- | src/backend/access/nbtree/nbtsearch.c | 220 |
1 files changed, 175 insertions, 45 deletions
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index 0daae3cd586..91089d85454 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -1,14 +1,14 @@ /*------------------------------------------------------------------------- * * nbtsearch.c - * search code for postgres btrees. + * Search code for postgres btrees. * * * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.73 2003/02/21 00:06:21 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.74 2003/02/22 00:45:04 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -19,6 +19,7 @@ #include "access/nbtree.h" +static Buffer _bt_walk_left(Relation rel, Buffer buf); static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir); @@ -79,10 +80,11 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, par_blkno = BufferGetBlockNumber(*bufP); /* - * We need to save the bit image of the index entry we chose in + * We need to save the location of the index entry we chose in * the parent page on a stack. In case we split the tree, we'll - * use this bit image to figure out what our real parent page is, - * in case the parent splits while we're working lower in the + * use the stack to work back up to the parent page. We also save + * the actual downlink (TID) to uniquely identify the index entry, + * in case it moves right while we're working lower in the * tree. See the paper by Lehman and Yao for how this is detected * and handled. (We use the child link to disambiguate duplicate * keys in the index -- Lehman and Yao disallow duplicate keys.) @@ -114,7 +116,7 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, /* * _bt_moveright() -- move right in the btree if necessary. * - * When we drop and reacquire a pointer to a page, it is possible that + * When we follow a pointer to reach a page, it is possible that * the page has changed in the meanwhile. If this happens, we're * guaranteed that the page has "split right" -- that is, that any * data that appeared on the page originally is either on the page @@ -148,9 +150,13 @@ _bt_moveright(Relation rel, * right. (If the scan key is equal to the high key, we might or * might not need to move right; have to scan the page first anyway.) * It could even have split more than once, so scan as far as needed. + * + * We also have to move right if we followed a link that brought us to + * a dead page. */ while (!P_RIGHTMOST(opaque) && - _bt_compare(rel, keysz, scankey, page, P_HIKEY) > 0) + (P_IGNORE(opaque) || + _bt_compare(rel, keysz, scankey, page, P_HIKEY) > 0)) { /* step right one page */ BlockNumber rblkno = opaque->btpo_next; @@ -161,6 +167,10 @@ _bt_moveright(Relation rel, opaque = (BTPageOpaque) PageGetSpecialPointer(page); } + if (P_IGNORE(opaque)) + elog(ERROR, "_bt_moveright: fell off the end of %s", + RelationGetRelationName(rel)); + return buf; } @@ -796,7 +806,6 @@ _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) OffsetNumber offnum, maxoff; BlockNumber blkno; - BlockNumber obknum; /* * Don't use ItemPointerGetOffsetNumber or you risk to get assertion @@ -814,7 +823,7 @@ _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) offnum = OffsetNumberNext(offnum); else { - /* walk right to the next page with data */ + /* Walk right to the next page with data */ for (;;) { /* if we're at end of scan, release the buffer and return */ @@ -831,58 +840,56 @@ _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) *bufP = _bt_getbuf(rel, blkno, BT_READ); page = BufferGetPage(*bufP); opaque = (BTPageOpaque) PageGetSpecialPointer(page); - maxoff = PageGetMaxOffsetNumber(page); - /* done if it's not empty */ - offnum = P_FIRSTDATAKEY(opaque); - if (!PageIsEmpty(page) && offnum <= maxoff) - break; + if (!P_IGNORE(opaque)) + { + maxoff = PageGetMaxOffsetNumber(page); + /* done if it's not empty */ + offnum = P_FIRSTDATAKEY(opaque); + if (!PageIsEmpty(page) && offnum <= maxoff) + break; + } } } } - else + else /* backwards scan */ { if (offnum > P_FIRSTDATAKEY(opaque)) offnum = OffsetNumberPrev(offnum); else { - /* walk left to the next page with data */ + /* + * Walk left to the next page with data. This is much more + * complex than the walk-right case because of the possibility + * that the page to our left splits while we are in flight to it, + * plus the possibility that the page we were on gets deleted + * after we leave it. See nbtree/README for details. + */ for (;;) { - /* if we're at end of scan, release the buffer and return */ - if (P_LEFTMOST(opaque)) + *bufP = _bt_walk_left(rel, *bufP); + + /* if we're at end of scan, return failure */ + if (*bufP == InvalidBuffer) { - _bt_relbuf(rel, *bufP); ItemPointerSetInvalid(current); - *bufP = so->btso_curbuf = InvalidBuffer; + so->btso_curbuf = InvalidBuffer; return false; } - /* step left */ - obknum = BufferGetBlockNumber(*bufP); - blkno = opaque->btpo_prev; - _bt_relbuf(rel, *bufP); - *bufP = _bt_getbuf(rel, blkno, BT_READ); page = BufferGetPage(*bufP); opaque = (BTPageOpaque) PageGetSpecialPointer(page); - /* - * If the adjacent page just split, then we have to walk - * right to find the block that's now adjacent to where we - * were. Because pages only split right, we don't have to - * worry about this failing to terminate. + * Okay, we managed to move left to a non-deleted page. + * Done if it's not half-dead and not empty. Else loop back + * and do it all again. */ - while (opaque->btpo_next != obknum) + if (!P_IGNORE(opaque)) { - blkno = opaque->btpo_next; - _bt_relbuf(rel, *bufP); - *bufP = _bt_getbuf(rel, blkno, BT_READ); - page = BufferGetPage(*bufP); - opaque = (BTPageOpaque) PageGetSpecialPointer(page); + maxoff = PageGetMaxOffsetNumber(page); + offnum = maxoff; + if (!PageIsEmpty(page) && + maxoff >= P_FIRSTDATAKEY(opaque)) + break; } - /* done if it's not empty */ - maxoff = PageGetMaxOffsetNumber(page); - offnum = maxoff; - if (!PageIsEmpty(page) && maxoff >= P_FIRSTDATAKEY(opaque)) - break; } } } @@ -896,10 +903,132 @@ _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) } /* + * _bt_walk_left() -- step left one page, if possible + * + * The given buffer must be pinned and read-locked. This will be dropped + * before stepping left. On return, we have pin and read lock on the + * returned page, instead. + * + * Returns InvalidBuffer if there is no page to the left (no lock is held + * in that case). + * + * When working on a non-leaf level, it is possible for the returned page + * to be half-dead; the caller should check that condition and step left + * again if it's important. + */ +static Buffer +_bt_walk_left(Relation rel, Buffer buf) +{ + Page page; + BTPageOpaque opaque; + + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + for (;;) + { + BlockNumber obknum; + BlockNumber lblkno; + BlockNumber blkno; + int tries; + + /* if we're at end of tree, release buf and return failure */ + if (P_LEFTMOST(opaque)) + { + _bt_relbuf(rel, buf); + break; + } + /* remember original page we are stepping left from */ + obknum = BufferGetBlockNumber(buf); + /* step left */ + blkno = lblkno = opaque->btpo_prev; + _bt_relbuf(rel, buf); + buf = _bt_getbuf(rel, blkno, BT_READ); + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + /* + * If this isn't the page we want, walk right till we find + * what we want --- but go no more than four hops (an + * arbitrary limit). If we don't find the correct page by then, + * the most likely bet is that the original page got deleted + * and isn't in the sibling chain at all anymore, not that its + * left sibling got split more than four times. + * + * Note that it is correct to test P_ISDELETED not P_IGNORE + * here, because half-dead pages are still in the sibling + * chain. Caller must reject half-dead pages if wanted. + */ + tries = 0; + for (;;) + { + if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum) + { + /* Found desired page, return it */ + return buf; + } + if (P_RIGHTMOST(opaque) || ++tries > 4) + break; + blkno = opaque->btpo_next; + _bt_relbuf(rel, buf); + buf = _bt_getbuf(rel, blkno, BT_READ); + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + } + + /* Return to the original page to see what's up */ + _bt_relbuf(rel, buf); + buf = _bt_getbuf(rel, obknum, BT_READ); + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + if (P_ISDELETED(opaque)) + { + /* + * It was deleted. Move right to first nondeleted page + * (there must be one); that is the page that has acquired the + * deleted one's keyspace, so stepping left from it will take + * us where we want to be. + */ + for (;;) + { + if (P_RIGHTMOST(opaque)) + elog(ERROR, "_bt_walk_left: fell off the end of %s", + RelationGetRelationName(rel)); + blkno = opaque->btpo_next; + _bt_relbuf(rel, buf); + buf = _bt_getbuf(rel, blkno, BT_READ); + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + if (!P_ISDELETED(opaque)) + break; + } + /* + * Now return to top of loop, resetting obknum to + * point to this nondeleted page, and try again. + */ + } + else + { + /* + * It wasn't deleted; the explanation had better be + * that the page to the left got split or deleted. + * Without this check, we'd go into an infinite loop + * if there's anything wrong. + */ + if (opaque->btpo_prev == lblkno) + elog(ERROR, "_bt_walk_left: can't find left sibling in %s", + RelationGetRelationName(rel)); + /* Okay to try again with new lblkno value */ + } + } + + return InvalidBuffer; +} + +/* * _bt_get_endpoint() -- Find the first or last page on a given tree level * * If the index is empty, we will return InvalidBuffer; any other failure - * condition causes elog(). + * condition causes elog(). We will not return a dead page. * * The returned buffer is pinned and read-locked. */ @@ -941,12 +1070,13 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost) * step right if needed to get to it (this could happen if the * page split since we obtained a pointer to it). */ - while (P_ISDELETED(opaque) || + while (P_IGNORE(opaque) || (rightmost && !P_RIGHTMOST(opaque))) { blkno = opaque->btpo_next; if (blkno == P_NONE) - elog(ERROR, "_bt_get_endpoint: ran off end of btree"); + elog(ERROR, "_bt_get_endpoint: fell off the end of %s", + RelationGetRelationName(rel)); _bt_relbuf(rel, buf); buf = _bt_getbuf(rel, blkno, BT_READ); page = BufferGetPage(buf); @@ -959,7 +1089,7 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost) if (opaque->btpo.level < level) elog(ERROR, "_bt_get_endpoint: btree level %u not found", level); - /* Step to leftmost or rightmost child page */ + /* Descend to leftmost or rightmost child page */ if (rightmost) offnum = PageGetMaxOffsetNumber(page); else |