diff options
Diffstat (limited to 'src/backend/access/nbtree/nbtsearch.c')
-rw-r--r-- | src/backend/access/nbtree/nbtsearch.c | 1327 |
1 files changed, 422 insertions, 905 deletions
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index 54c15b2f6a9..49aec3b23d9 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -1,14 +1,14 @@ /*------------------------------------------------------------------------- * - * btsearch.c + * nbtsearch.c * search code for postgres btrees. * + * * Portions Copyright (c) 1996-2000, PostgreSQL, Inc * Portions Copyright (c) 1994, Regents of the University of California * - * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.60 2000/05/30 04:24:33 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.61 2000/07/21 06:42:32 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -19,102 +19,96 @@ #include "access/nbtree.h" +static RetrieveIndexResult _bt_endpoint(IndexScanDesc scan, ScanDirection dir); -static BTStack _bt_searchr(Relation rel, int keysz, ScanKey scankey, - Buffer *bufP, BTStack stack_in); -static int32 _bt_compare(Relation rel, TupleDesc itupdesc, Page page, - int keysz, ScanKey scankey, OffsetNumber offnum); -static bool - _bt_twostep(IndexScanDesc scan, Buffer *bufP, ScanDirection dir); -static RetrieveIndexResult - _bt_endpoint(IndexScanDesc scan, ScanDirection dir); /* - * _bt_search() -- Search for a scan key in the index. + * _bt_search() -- Search the tree for a particular scankey, + * or more precisely for the first leaf page it could be on. + * + * Return value is a stack of parent-page pointers. *bufP is set to the + * address of the leaf-page buffer, which is read-locked and pinned. + * No locks are held on the parent pages, however! * - * This routine is actually just a helper that sets things up and - * calls a recursive-descent search routine on the tree. + * NOTE that the returned buffer is read-locked regardless of the access + * parameter. However, access = BT_WRITE will allow an empty root page + * to be created and returned. When access = BT_READ, an empty index + * will result in *bufP being set to InvalidBuffer. */ BTStack -_bt_search(Relation rel, int keysz, ScanKey scankey, Buffer *bufP) -{ - *bufP = _bt_getroot(rel, BT_READ); - return _bt_searchr(rel, keysz, scankey, bufP, (BTStack) NULL); -} - -/* - * _bt_searchr() -- Search the tree recursively for a particular scankey. - */ -static BTStack -_bt_searchr(Relation rel, - int keysz, - ScanKey scankey, - Buffer *bufP, - BTStack stack_in) +_bt_search(Relation rel, int keysz, ScanKey scankey, + Buffer *bufP, int access) { - BTStack stack; - OffsetNumber offnum; - Page page; - BTPageOpaque opaque; - BlockNumber par_blkno; - BlockNumber blkno; - ItemId itemid; - BTItem btitem; - BTItem item_save; - int item_nbytes; - IndexTuple itup; + BTStack stack_in = NULL; - /* if this is a leaf page, we're done */ - page = BufferGetPage(*bufP); - opaque = (BTPageOpaque) PageGetSpecialPointer(page); - if (opaque->btpo_flags & BTP_LEAF) - return stack_in; + /* Get the root page to start with */ + *bufP = _bt_getroot(rel, access); - /* - * Find the appropriate item on the internal page, and get the child - * page that it points to. - */ + /* If index is empty and access = BT_READ, no root page is created. */ + if (! BufferIsValid(*bufP)) + return (BTStack) NULL; - par_blkno = BufferGetBlockNumber(*bufP); - offnum = _bt_binsrch(rel, *bufP, keysz, scankey, BT_DESCENT); - itemid = PageGetItemId(page, offnum); - btitem = (BTItem) PageGetItem(page, itemid); - itup = &(btitem->bti_itup); - blkno = ItemPointerGetBlockNumber(&(itup->t_tid)); + /* Loop iterates once per level descended in the tree */ + for (;;) + { + Page page; + BTPageOpaque opaque; + OffsetNumber offnum; + ItemId itemid; + BTItem btitem; + IndexTuple itup; + BlockNumber blkno; + BlockNumber par_blkno; + BTStack new_stack; + + /* if this is a leaf page, we're done */ + page = BufferGetPage(*bufP); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + if (P_ISLEAF(opaque)) + break; - /* - * We need to save the bit image of the index entry we chose in the - * parent page on a stack. In case we split the tree, we'll use this - * bit image to figure out what our real parent page is, in case the - * parent splits while we're working lower in the tree. See the paper - * by Lehman and Yao for how this is detected and handled. (We use - * unique OIDs to disambiguate duplicate keys in the index -- Lehman - * and Yao disallow duplicate keys). - */ + /* + * Find the appropriate item on the internal page, and get the + * child page that it points to. + */ + offnum = _bt_binsrch(rel, *bufP, keysz, scankey); + itemid = PageGetItemId(page, offnum); + btitem = (BTItem) PageGetItem(page, itemid); + itup = &(btitem->bti_itup); + blkno = ItemPointerGetBlockNumber(&(itup->t_tid)); + par_blkno = BufferGetBlockNumber(*bufP); - item_nbytes = ItemIdGetLength(itemid); - item_save = (BTItem) palloc(item_nbytes); - memmove((char *) item_save, (char *) btitem, item_nbytes); - stack = (BTStack) palloc(sizeof(BTStackData)); - stack->bts_blkno = par_blkno; - stack->bts_offset = offnum; - stack->bts_btitem = item_save; - stack->bts_parent = stack_in; + /* + * We need to save the bit image of the index entry we chose in the + * parent page on a stack. In case we split the tree, we'll use this + * bit image to figure out what our real parent page is, in case the + * parent splits while we're working lower in the tree. See the paper + * by Lehman and Yao for how this is detected and handled. (We use the + * child link to disambiguate duplicate keys in the index -- Lehman + * and Yao disallow duplicate keys.) + */ + new_stack = (BTStack) palloc(sizeof(BTStackData)); + new_stack->bts_blkno = par_blkno; + new_stack->bts_offset = offnum; + memcpy(&new_stack->bts_btitem, btitem, sizeof(BTItemData)); + new_stack->bts_parent = stack_in; - /* drop the read lock on the parent page and acquire one on the child */ - _bt_relbuf(rel, *bufP, BT_READ); - *bufP = _bt_getbuf(rel, blkno, BT_READ); + /* drop the read lock on the parent page, acquire one on the child */ + _bt_relbuf(rel, *bufP, BT_READ); + *bufP = _bt_getbuf(rel, blkno, BT_READ); - /* - * Race -- the page we just grabbed may have split since we read its - * pointer in the parent. If it has, we may need to move right to its - * new sibling. Do that. - */ + /* + * Race -- the page we just grabbed may have split since we read its + * pointer in the parent. If it has, we may need to move right to its + * new sibling. Do that. + */ + *bufP = _bt_moveright(rel, *bufP, keysz, scankey, BT_READ); - *bufP = _bt_moveright(rel, *bufP, keysz, scankey, BT_READ); + /* okay, all set to move down a level */ + stack_in = new_stack; + } - /* okay, all set to move down a level */ - return _bt_searchr(rel, keysz, scankey, bufP, stack); + return stack_in; } /* @@ -133,7 +127,7 @@ _bt_searchr(Relation rel, * * On entry, we have the buffer pinned and a lock of the proper type. * If we move right, we release the buffer and lock and acquire the - * same on the right sibling. + * same on the right sibling. Return value is the buffer we stop at. */ Buffer _bt_moveright(Relation rel, @@ -144,231 +138,81 @@ _bt_moveright(Relation rel, { Page page; BTPageOpaque opaque; - ItemId hikey; - BlockNumber rblkno; - int natts = rel->rd_rel->relnatts; page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); - /* if we're on a rightmost page, we don't need to move right */ - if (P_RIGHTMOST(opaque)) - return buf; - - /* by convention, item 0 on non-rightmost pages is the high key */ - hikey = PageGetItemId(page, P_HIKEY); - /* - * If the scan key that brought us to this page is >= the high key + * If the scan key that brought us to this page is > the high key * stored on the page, then the page has split and we need to move - * right. + * right. (If the scan key is equal to the high key, we might or + * might not need to move right; have to scan the page first anyway.) + * It could even have split more than once, so scan as far as needed. */ - - if (_bt_skeycmp(rel, keysz, scankey, page, hikey, - BTGreaterEqualStrategyNumber)) + while (!P_RIGHTMOST(opaque) && + _bt_compare(rel, keysz, scankey, page, P_HIKEY) > 0) { - /* move right as long as we need to */ - do - { - OffsetNumber offmax = PageGetMaxOffsetNumber(page); - - /* - * If this page consists of all duplicate keys (hikey and - * first key on the page have the same value), then we don't - * need to step right. - * - * NOTE for multi-column indices: we may do scan using keys not - * for all attrs. But we handle duplicates using all attrs in - * _bt_insert/_bt_spool code. And so we've to compare scankey - * with _last_ item on this page to do not lose "good" tuples - * if number of attrs > keysize. Example: (2,0) - last items - * on this page, (2,1) - first item on next page (hikey), our - * scankey is x = 2. Scankey == (2,1) because of we compare - * first attrs only, but we shouldn't to move right of here. - - * vadim 04/15/97 - * - * Also, if this page is not LEAF one (and # of attrs > keysize) - * then we can't move too. - vadim 10/22/97 - */ - - if (_bt_skeycmp(rel, keysz, scankey, page, hikey, - BTEqualStrategyNumber)) - { - if (opaque->btpo_flags & BTP_CHAIN) - { - Assert((opaque->btpo_flags & BTP_LEAF) || offmax > P_HIKEY); - break; - } - if (offmax > P_HIKEY) - { - if (natts == keysz) /* sanity checks */ - { - if (_bt_skeycmp(rel, keysz, scankey, page, - PageGetItemId(page, P_FIRSTKEY), - BTEqualStrategyNumber)) - elog(FATAL, "btree: BTP_CHAIN flag was expected in %s (access = %s)", - RelationGetRelationName(rel), access ? "bt_write" : "bt_read"); - if (_bt_skeycmp(rel, keysz, scankey, page, - PageGetItemId(page, offmax), - BTEqualStrategyNumber)) - elog(FATAL, "btree: unexpected equal last item"); - if (_bt_skeycmp(rel, keysz, scankey, page, - PageGetItemId(page, offmax), - BTLessStrategyNumber)) - elog(FATAL, "btree: unexpected greater last item"); - /* move right */ - } - else if (!(opaque->btpo_flags & BTP_LEAF)) - break; - else if (_bt_skeycmp(rel, keysz, scankey, page, - PageGetItemId(page, offmax), - BTLessEqualStrategyNumber)) - break; - } - } + /* step right one page */ + BlockNumber rblkno = opaque->btpo_next; - /* step right one page */ - rblkno = opaque->btpo_next; - _bt_relbuf(rel, buf, access); - buf = _bt_getbuf(rel, rblkno, access); - page = BufferGetPage(buf); - opaque = (BTPageOpaque) PageGetSpecialPointer(page); - hikey = PageGetItemId(page, P_HIKEY); - - } while (!P_RIGHTMOST(opaque) - && _bt_skeycmp(rel, keysz, scankey, page, hikey, - BTGreaterEqualStrategyNumber)); + _bt_relbuf(rel, buf, access); + buf = _bt_getbuf(rel, rblkno, access); + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); } + return buf; } /* - * _bt_skeycmp() -- compare a scan key to a particular item on a page using - * a requested strategy (<, <=, =, >=, >). + * _bt_binsrch() -- Do a binary search for a key on a particular page. * - * We ignore the unique OIDs stored in the btree item here. Those - * numbers are intended for use internally only, in repositioning a - * scan after a page split. They do not impose any meaningful ordering. + * The scankey we get has the compare function stored in the procedure + * entry of each data struct. We invoke this regproc to do the + * comparison for every key in the scankey. * - * The comparison is A <op> B, where A is the scan key and B is the - * tuple pointed at by itemid on page. - */ -bool -_bt_skeycmp(Relation rel, - Size keysz, - ScanKey scankey, - Page page, - ItemId itemid, - StrategyNumber strat) -{ - BTItem item; - IndexTuple indexTuple; - TupleDesc tupDes; - int i; - int32 compare = 0; - - item = (BTItem) PageGetItem(page, itemid); - indexTuple = &(item->bti_itup); - - tupDes = RelationGetDescr(rel); - - for (i = 1; i <= (int) keysz; i++) - { - ScanKey entry = &scankey[i - 1]; - Datum attrDatum; - bool isNull; - - Assert(entry->sk_attno == i); - attrDatum = index_getattr(indexTuple, - entry->sk_attno, - tupDes, - &isNull); - - /* see comments about NULLs handling in btbuild */ - if (entry->sk_flags & SK_ISNULL) /* key is NULL */ - { - if (isNull) - compare = 0; /* NULL key "=" NULL datum */ - else - compare = 1; /* NULL key ">" not-NULL datum */ - } - else if (isNull) /* key is NOT_NULL and item is NULL */ - { - compare = -1; /* not-NULL key "<" NULL datum */ - } - else - compare = DatumGetInt32(FunctionCall2(&entry->sk_func, - entry->sk_argument, - attrDatum)); - - if (compare != 0) - break; /* done when we find unequal attributes */ - } - - switch (strat) - { - case BTLessStrategyNumber: - return (bool) (compare < 0); - case BTLessEqualStrategyNumber: - return (bool) (compare <= 0); - case BTEqualStrategyNumber: - return (bool) (compare == 0); - case BTGreaterEqualStrategyNumber: - return (bool) (compare >= 0); - case BTGreaterStrategyNumber: - return (bool) (compare > 0); - } - - elog(ERROR, "_bt_skeycmp: bogus strategy %d", (int) strat); - return false; -} - -/* - * _bt_binsrch() -- Do a binary search for a key on a particular page. + * On a leaf page, _bt_binsrch() returns the OffsetNumber of the first + * key >= given scankey. (NOTE: in particular, this means it is possible + * to return a value 1 greater than the number of keys on the page, + * if the scankey is > all keys on the page.) * - * The scankey we get has the compare function stored in the procedure - * entry of each data struct. We invoke this regproc to do the - * comparison for every key in the scankey. _bt_binsrch() returns - * the OffsetNumber of the first matching key on the page, or the - * OffsetNumber at which the matching key would appear if it were - * on this page. (NOTE: in particular, this means it is possible to - * return a value 1 greater than the number of keys on the page, if - * the scankey is > all keys on the page.) + * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber + * of the last key < given scankey. (Since _bt_compare treats the first + * data key of such a page as minus infinity, there will be at least one + * key < scankey, so the result always points at one of the keys on the + * page.) This key indicates the right place to descend to be sure we + * find all leaf keys >= given scankey. * - * By the time this procedure is called, we're sure we're looking - * at the right page -- don't need to walk right. _bt_binsrch() has - * no lock or refcount side effects on the buffer. + * This procedure is not responsible for walking right, it just examines + * the given page. _bt_binsrch() has no lock or refcount side effects + * on the buffer. */ OffsetNumber _bt_binsrch(Relation rel, Buffer buf, int keysz, - ScanKey scankey, - int srchtype) + ScanKey scankey) { TupleDesc itupdesc; Page page; BTPageOpaque opaque; OffsetNumber low, high; - bool haveEq; - int natts = rel->rd_rel->relnatts; int32 result; itupdesc = RelationGetDescr(rel); page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); - /* by convention, item 1 on any non-rightmost page is the high key */ - low = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY; - + low = P_FIRSTDATAKEY(opaque); high = PageGetMaxOffsetNumber(page); /* * If there are no keys on the page, return the first available slot. * Note this covers two cases: the page is really empty (no keys), or * it contains only a high key. The latter case is possible after - * vacuuming. + * vacuuming. This can never happen on an internal page, however, + * since they are never empty (an internal page must have children). */ if (high < low) return low; @@ -376,11 +220,9 @@ _bt_binsrch(Relation rel, /* * Binary search to find the first key on the page >= scan key. Loop * invariant: all slots before 'low' are < scan key, all slots at or - * after 'high' are >= scan key. Also, haveEq is true if the tuple at - * 'high' is == scan key. We can fall out when high == low. + * after 'high' are >= scan key. We can fall out when high == low. */ high++; /* establish the loop invariant for high */ - haveEq = false; while (high > low) { @@ -388,175 +230,77 @@ _bt_binsrch(Relation rel, /* We have low <= mid < high, so mid points at a real slot */ - result = _bt_compare(rel, itupdesc, page, keysz, scankey, mid); + result = _bt_compare(rel, keysz, scankey, page, mid); if (result > 0) low = mid + 1; else - { high = mid; - haveEq = (result == 0); - } } /*-------------------- * At this point we have high == low, but be careful: they could point - * past the last slot on the page. We also know that haveEq is true - * if and only if there is an equal key (in which case high&low point - * at the first equal key). + * past the last slot on the page. * * On a leaf page, we always return the first key >= scan key * (which could be the last slot + 1). *-------------------- */ - - if (opaque->btpo_flags & BTP_LEAF) + if (P_ISLEAF(opaque)) return low; /*-------------------- - * On a non-leaf page, there are special cases: - * - * For an insertion (srchtype != BT_DESCENT and natts == keysz) - * always return first key >= scan key (which could be off the end). - * - * For a standard search (srchtype == BT_DESCENT and natts == keysz) - * return the first equal key if one exists, else the last lesser key - * if one exists, else the first slot on the page. - * - * For a partial-match search (srchtype == BT_DESCENT and natts > keysz) - * return the last lesser key if one exists, else the first slot. - * - * Old comments: - * For multi-column indices, we may scan using keys - * not for all attrs. But we handle duplicates using all attrs - * in _bt_insert/_bt_spool code. And so while searching on - * internal pages having number of attrs > keysize we want to - * point at the last item < the scankey, not at the first item - * = the scankey (!!!), and let _bt_moveright decide later - * whether to move right or not (see comments and example - * there). Note also that INSERTions are not affected by this - * code (since natts == keysz for inserts). - vadim 04/15/97 + * On a non-leaf page, return the last key < scan key. + * There must be one if _bt_compare() is playing by the rules. *-------------------- */ - - if (haveEq) - { - - /* - * There is an equal key. We return either the first equal key - * (which we just found), or the last lesser key. - * - * We need not check srchtype != BT_DESCENT here, since if that is - * true then natts == keysz by assumption. - */ - if (natts == keysz) - return low; /* return first equal key */ - } - else - { - - /* - * There is no equal key. We return either the first greater key - * (which we just found), or the last lesser key. - */ - if (srchtype != BT_DESCENT) - return low; /* return first greater key */ - } - - - if (low == (P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY)) - return low; /* there is no prior item */ + Assert(low > P_FIRSTDATAKEY(opaque)); return OffsetNumberPrev(low); } -/* +/*---------- * _bt_compare() -- Compare scankey to a particular tuple on the page. * + * keysz: number of key conditions to be checked (might be less than the + * total length of the scan key!) + * page/offnum: location of btree item to be compared to. + * * This routine returns: * <0 if scankey < tuple at offnum; * 0 if scankey == tuple at offnum; * >0 if scankey > tuple at offnum. + * NULLs in the keys are treated as sortable values. Therefore + * "equality" does not necessarily mean that the item should be + * returned to the caller as a matching key! * - * -- Old comments: - * In order to avoid having to propagate changes up the tree any time - * a new minimal key is inserted, the leftmost entry on the leftmost - * page is less than all possible keys, by definition. - * - * -- New ones: - * New insertion code (fix against updating _in_place_ if new minimal - * key has bigger size than old one) may delete P_HIKEY entry on the - * root page in order to insert new minimal key - and so this definition - * does not work properly in this case and breaks key' order on root - * page. BTW, this propagation occures only while page' splitting, - * but not "any time a new min key is inserted" (see _bt_insertonpg). - * - vadim 12/05/96 + * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be + * "minus infinity": this routine will always claim it is less than the + * scankey. The actual key value stored (if any, which there probably isn't) + * does not matter. This convention allows us to implement the Lehman and + * Yao convention that the first down-link pointer is before the first key. + * See backend/access/nbtree/README for details. + *---------- */ -static int32 +int32 _bt_compare(Relation rel, - TupleDesc itupdesc, - Page page, int keysz, ScanKey scankey, + Page page, OffsetNumber offnum) { - Datum datum; + TupleDesc itupdesc = RelationGetDescr(rel); + BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page); BTItem btitem; IndexTuple itup; - BTPageOpaque opaque; - ScanKey entry; - AttrNumber attno; - int32 result; int i; - bool null; /* - * If this is a leftmost internal page, and if our comparison is with - * the first key on the page, then the item at that position is by - * definition less than the scan key. - * - * - see new comments above... + * Force result ">" if target item is first data item on an internal + * page --- see NOTE above. */ - - opaque = (BTPageOpaque) PageGetSpecialPointer(page); - - if (!(opaque->btpo_flags & BTP_LEAF) - && P_LEFTMOST(opaque) - && offnum == P_HIKEY) - { - - /* - * we just have to believe that this will only be called with - * offnum == P_HIKEY when P_HIKEY is the OffsetNumber of the first - * actual data key (i.e., this is also a rightmost page). there - * doesn't seem to be any code that implies that the leftmost page - * is normally missing a high key as well as the rightmost page. - * but that implies that this code path only applies to the root - * -- which seems unlikely.. - * - * - see new comments above... - */ - if (!P_RIGHTMOST(opaque)) - elog(ERROR, "_bt_compare: invalid comparison to high key"); - -#ifdef NOT_USED - - /* - * We just have to belive that right answer will not break - * anything. I've checked code and all seems to be ok. See new - * comments above... - * - * -- Old comments If the item on the page is equal to the scankey, - * that's okay to admit. We just can't claim that the first key - * on the page is greater than anything. - */ - - if (_bt_skeycmp(rel, keysz, scankey, page, PageGetItemId(page, offnum), - BTEqualStrategyNumber)) - return 0; + if (! P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque)) return 1; -#endif - } btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum)); itup = &(btitem->bti_itup); @@ -568,37 +312,45 @@ _bt_compare(Relation rel, * they be in order. If you think about how multi-key ordering works, * you'll understand why this is. * - * We don't test for violation of this condition here. + * We don't test for violation of this condition here, however. The + * initial setup for the index scan had better have gotten it right + * (see _bt_first). */ - for (i = 1; i <= keysz; i++) + for (i = 0; i < keysz; i++) { - entry = &scankey[i - 1]; - attno = entry->sk_attno; - datum = index_getattr(itup, attno, itupdesc, &null); + ScanKey entry = &scankey[i]; + Datum datum; + bool isNull; + int32 result; + + datum = index_getattr(itup, entry->sk_attno, itupdesc, &isNull); /* see comments about NULLs handling in btbuild */ - if (entry->sk_flags & SK_ISNULL) /* key is NULL */ + if (entry->sk_flags & SK_ISNULL) /* key is NULL */ { - if (null) + if (isNull) result = 0; /* NULL "=" NULL */ else result = 1; /* NULL ">" NOT_NULL */ } - else if (null) /* key is NOT_NULL and item is NULL */ + else if (isNull) /* key is NOT_NULL and item is NULL */ { result = -1; /* NOT_NULL "<" NULL */ } else + { result = DatumGetInt32(FunctionCall2(&entry->sk_func, - entry->sk_argument, datum)); + entry->sk_argument, + datum)); + } /* if the keys are unequal, return the difference */ if (result != 0) return result; } - /* by here, the keys are equal */ + /* if we get here, the keys are equal */ return 0; } @@ -606,10 +358,10 @@ _bt_compare(Relation rel, * _bt_next() -- Get the next item in a scan. * * On entry, we have a valid currentItemData in the scan, and a - * read lock on the page that contains that item. We do not have - * the page pinned. We return the next item in the scan. On - * exit, we have the page containing the next item locked but not - * pinned. + * read lock and pin count on the page that contains that item. + * We return the next item in the scan, or NULL if no more. + * On successful exit, the page containing the new item is locked + * and pinned; on NULL exit, no lock or pin is held. */ RetrieveIndexResult _bt_next(IndexScanDesc scan, ScanDirection dir) @@ -618,7 +370,6 @@ _bt_next(IndexScanDesc scan, ScanDirection dir) Buffer buf; Page page; OffsetNumber offnum; - RetrieveIndexResult res; ItemPointer current; BTItem btitem; IndexTuple itup; @@ -629,10 +380,9 @@ _bt_next(IndexScanDesc scan, ScanDirection dir) so = (BTScanOpaque) scan->opaque; current = &(scan->currentItemData); - Assert(BufferIsValid(so->btso_curbuf)); - /* we still have the buffer pinned and locked */ buf = so->btso_curbuf; + Assert(BufferIsValid(buf)); do { @@ -640,7 +390,7 @@ _bt_next(IndexScanDesc scan, ScanDirection dir) if (!_bt_step(scan, &buf, dir)) return (RetrieveIndexResult) NULL; - /* by here, current is the tuple we want to return */ + /* current is the next candidate tuple to return */ offnum = ItemPointerGetOffsetNumber(current); page = BufferGetPage(buf); btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum)); @@ -648,17 +398,16 @@ _bt_next(IndexScanDesc scan, ScanDirection dir) if (_bt_checkkeys(scan, itup, &keysok)) { + /* tuple passes all scan key conditions, so return it */ Assert(keysok == so->numberOfKeys); - res = FormRetrieveIndexResult(current, &(itup->t_tid)); - - /* remember which buffer we have pinned and locked */ - so->btso_curbuf = buf; - return res; + return FormRetrieveIndexResult(current, &(itup->t_tid)); } + /* This tuple doesn't pass, but there might be more that do */ } while (keysok >= so->numberOfFirstKeys || (keysok == ((Size) -1) && ScanDirectionIsBackward(dir))); + /* No more items, so close down the current-item info */ ItemPointerSetInvalid(current); so->btso_curbuf = InvalidBuffer; _bt_relbuf(rel, buf, BT_READ); @@ -680,14 +429,10 @@ RetrieveIndexResult _bt_first(IndexScanDesc scan, ScanDirection dir) { Relation rel; - TupleDesc itupdesc; Buffer buf; Page page; - BTPageOpaque pop; BTStack stack; - OffsetNumber offnum, - maxoff; - bool offGmax = false; + OffsetNumber offnum; BTItem btitem; IndexTuple itup; ItemPointer current; @@ -698,7 +443,6 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) int32 result; BTScanOpaque so; Size keysok; - bool strategyCheck; ScanKey scankeys = 0; int keysCount = 0; @@ -784,20 +528,17 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) return _bt_endpoint(scan, dir); } - itupdesc = RelationGetDescr(rel); - current = &(scan->currentItemData); - /* * Okay, we want something more complicated. What we'll do is use the * first item in the scan key passed in (which has been correctly * ordered to take advantage of index ordering) to position ourselves * at the right place in the scan. */ - /* _bt_orderkeys disallows it, but it's place to add some code latter */ scankeys = (ScanKey) palloc(keysCount * sizeof(ScanKeyData)); for (i = 0; i < keysCount; i++) { j = nKeyIs[i]; + /* _bt_orderkeys disallows it, but it's place to add some code latter */ if (so->keyData[j].sk_flags & SK_ISNULL) { pfree(nKeyIs); @@ -812,234 +553,213 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) if (nKeyIs) pfree(nKeyIs); - stack = _bt_search(rel, keysCount, scankeys, &buf); - _bt_freestack(stack); - - blkno = BufferGetBlockNumber(buf); - page = BufferGetPage(buf); + current = &(scan->currentItemData); /* - * This will happen if the tree we're searching is entirely empty, or - * if we're doing a search for a key that would appear on an entirely - * empty internal page. In either case, there are no matching tuples - * in the index. + * Use the manufactured scan key to descend the tree and position + * ourselves on the target leaf page. */ + stack = _bt_search(rel, keysCount, scankeys, &buf, BT_READ); - if (PageIsEmpty(page)) + /* don't need to keep the stack around... */ + _bt_freestack(stack); + + if (! BufferIsValid(buf)) { + /* Only get here if index is completely empty */ ItemPointerSetInvalid(current); so->btso_curbuf = InvalidBuffer; - _bt_relbuf(rel, buf, BT_READ); pfree(scankeys); return (RetrieveIndexResult) NULL; } - maxoff = PageGetMaxOffsetNumber(page); - pop = (BTPageOpaque) PageGetSpecialPointer(page); - - /* - * Now _bt_moveright doesn't move from non-rightmost leaf page if - * scankey == hikey and there is only hikey there. It's good for - * insertion, but we need to do work for scan here. - vadim 05/27/97 - */ - - while (maxoff == P_HIKEY && !P_RIGHTMOST(pop) && - _bt_skeycmp(rel, keysCount, scankeys, page, - PageGetItemId(page, P_HIKEY), - BTGreaterEqualStrategyNumber)) - { - /* step right one page */ - blkno = pop->btpo_next; - _bt_relbuf(rel, buf, BT_READ); - buf = _bt_getbuf(rel, blkno, BT_READ); - page = BufferGetPage(buf); - if (PageIsEmpty(page)) - { - ItemPointerSetInvalid(current); - so->btso_curbuf = InvalidBuffer; - _bt_relbuf(rel, buf, BT_READ); - pfree(scankeys); - return (RetrieveIndexResult) NULL; - } - maxoff = PageGetMaxOffsetNumber(page); - pop = (BTPageOpaque) PageGetSpecialPointer(page); - } - - /* find the nearest match to the manufactured scan key on the page */ - offnum = _bt_binsrch(rel, buf, keysCount, scankeys, BT_DESCENT); + /* remember which buffer we have pinned */ + so->btso_curbuf = buf; + blkno = BufferGetBlockNumber(buf); + page = BufferGetPage(buf); - if (offnum > maxoff) - { - offnum = maxoff; - offGmax = true; - } + offnum = _bt_binsrch(rel, buf, keysCount, scankeys); ItemPointerSet(current, blkno, offnum); - /* - * Now find the right place to start the scan. Result is the value - * we're looking for minus the value we're looking at in the index. + /*---------- + * At this point we are positioned at the first item >= scan key, + * or possibly at the end of a page on which all the existing items + * are < scan key and we know that everything on later pages is + * >= scan key. We could step forward in the latter case, but that'd + * be a waste of time if we want to scan backwards. So, it's now time to + * examine the scan strategy to find the exact place to start the scan. + * + * Note: if _bt_step fails (meaning we fell off the end of the index + * in one direction or the other), we either return NULL (no matches) or + * call _bt_endpoint() to set up a scan starting at that index endpoint, + * as appropriate for the desired scan type. + * + * it's yet other place to add some code latter for is(not)null ... + *---------- */ - result = _bt_compare(rel, itupdesc, page, keysCount, scankeys, offnum); - - /* it's yet other place to add some code latter for is(not)null */ - - strat = strat_total; - switch (strat) + switch (strat_total) { case BTLessStrategyNumber: - if (result <= 0) + /* + * Back up one to arrive at last item < scankey + */ + if (!_bt_step(scan, &buf, BackwardScanDirection)) { - do - { - if (!_bt_twostep(scan, &buf, BackwardScanDirection)) - break; - - offnum = ItemPointerGetOffsetNumber(current); - page = BufferGetPage(buf); - result = _bt_compare(rel, itupdesc, page, keysCount, scankeys, offnum); - } while (result <= 0); - + pfree(scankeys); + return (RetrieveIndexResult) NULL; } break; case BTLessEqualStrategyNumber: - if (result >= 0) + /* + * We need to find the last item <= scankey, so step forward + * till we find one > scankey, then step back one. + */ + if (offnum > PageGetMaxOffsetNumber(page)) { - do + if (!_bt_step(scan, &buf, ForwardScanDirection)) { - if (!_bt_twostep(scan, &buf, ForwardScanDirection)) - break; - - offnum = ItemPointerGetOffsetNumber(current); - page = BufferGetPage(buf); - result = _bt_compare(rel, itupdesc, page, keysCount, scankeys, offnum); - } while (result >= 0); + pfree(scankeys); + return _bt_endpoint(scan, dir); + } + } + for (;;) + { + offnum = ItemPointerGetOffsetNumber(current); + page = BufferGetPage(buf); + result = _bt_compare(rel, keysCount, scankeys, page, offnum); + if (result < 0) + break; + if (!_bt_step(scan, &buf, ForwardScanDirection)) + { + pfree(scankeys); + return _bt_endpoint(scan, dir); + } + } + if (!_bt_step(scan, &buf, BackwardScanDirection)) + { + pfree(scankeys); + return (RetrieveIndexResult) NULL; } - if (result < 0) - _bt_twostep(scan, &buf, BackwardScanDirection); break; case BTEqualStrategyNumber: - if (result != 0) + /* + * Make sure we are on the first equal item; might have to step + * forward if currently at end of page. + */ + if (offnum > PageGetMaxOffsetNumber(page)) { - _bt_relbuf(scan->relation, buf, BT_READ); - so->btso_curbuf = InvalidBuffer; - ItemPointerSetInvalid(&(scan->currentItemData)); - pfree(scankeys); - return (RetrieveIndexResult) NULL; + if (!_bt_step(scan, &buf, ForwardScanDirection)) + { + pfree(scankeys); + return (RetrieveIndexResult) NULL; + } + offnum = ItemPointerGetOffsetNumber(current); + page = BufferGetPage(buf); } - else if (ScanDirectionIsBackward(dir)) + result = _bt_compare(rel, keysCount, scankeys, page, offnum); + if (result != 0) + goto nomatches; /* no equal items! */ + /* + * If a backward scan was specified, need to start with last + * equal item not first one. + */ + if (ScanDirectionIsBackward(dir)) { do { - if (!_bt_twostep(scan, &buf, ForwardScanDirection)) - break; - + if (!_bt_step(scan, &buf, ForwardScanDirection)) + { + pfree(scankeys); + return _bt_endpoint(scan, dir); + } offnum = ItemPointerGetOffsetNumber(current); page = BufferGetPage(buf); - result = _bt_compare(rel, itupdesc, page, keysCount, scankeys, offnum); + result = _bt_compare(rel, keysCount, scankeys, page, offnum); } while (result == 0); - - if (result < 0) - _bt_twostep(scan, &buf, BackwardScanDirection); + if (!_bt_step(scan, &buf, BackwardScanDirection)) + elog(ERROR, "_bt_first: equal items disappeared?"); } break; case BTGreaterEqualStrategyNumber: - if (offGmax) + /* + * We want the first item >= scankey, which is where we are... + * unless we're not anywhere at all... + */ + if (offnum > PageGetMaxOffsetNumber(page)) { - if (result < 0) + if (!_bt_step(scan, &buf, ForwardScanDirection)) { - Assert(!P_RIGHTMOST(pop) && maxoff == P_HIKEY); - if (!_bt_step(scan, &buf, ForwardScanDirection)) - { - _bt_relbuf(scan->relation, buf, BT_READ); - so->btso_curbuf = InvalidBuffer; - ItemPointerSetInvalid(&(scan->currentItemData)); - pfree(scankeys); - return (RetrieveIndexResult) NULL; - } - } - else if (result > 0) - { /* Just remember: _bt_binsrch() returns - * the OffsetNumber of the first matching - * key on the page, or the OffsetNumber at - * which the matching key WOULD APPEAR IF - * IT WERE on this page. No key on this - * page, but offnum from _bt_binsrch() - * greater maxoff - have to move right. - - * vadim 12/06/96 */ - _bt_twostep(scan, &buf, ForwardScanDirection); + pfree(scankeys); + return (RetrieveIndexResult) NULL; } } - else if (result < 0) - { - do - { - if (!_bt_twostep(scan, &buf, BackwardScanDirection)) - break; - - page = BufferGetPage(buf); - offnum = ItemPointerGetOffsetNumber(current); - result = _bt_compare(rel, itupdesc, page, keysCount, scankeys, offnum); - } while (result < 0); - - if (result > 0) - _bt_twostep(scan, &buf, ForwardScanDirection); - } break; case BTGreaterStrategyNumber: - /* offGmax helps as above */ - if (result >= 0 || offGmax) + /* + * We want the first item > scankey, so make sure we are on + * an item and then step over any equal items. + */ + if (offnum > PageGetMaxOffsetNumber(page)) { - do + if (!_bt_step(scan, &buf, ForwardScanDirection)) { - if (!_bt_twostep(scan, &buf, ForwardScanDirection)) - break; - - offnum = ItemPointerGetOffsetNumber(current); - page = BufferGetPage(buf); - result = _bt_compare(rel, itupdesc, page, keysCount, scankeys, offnum); - } while (result >= 0); + pfree(scankeys); + return (RetrieveIndexResult) NULL; + } + offnum = ItemPointerGetOffsetNumber(current); + page = BufferGetPage(buf); + } + result = _bt_compare(rel, keysCount, scankeys, page, offnum); + while (result == 0) + { + if (!_bt_step(scan, &buf, ForwardScanDirection)) + { + pfree(scankeys); + return (RetrieveIndexResult) NULL; + } + offnum = ItemPointerGetOffsetNumber(current); + page = BufferGetPage(buf); + result = _bt_compare(rel, keysCount, scankeys, page, offnum); } break; } - pfree(scankeys); /* okay, current item pointer for the scan is right */ offnum = ItemPointerGetOffsetNumber(current); page = BufferGetPage(buf); btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum)); itup = &btitem->bti_itup; + /* is the first item actually acceptable? */ if (_bt_checkkeys(scan, itup, &keysok)) { + /* yes, return it */ res = FormRetrieveIndexResult(current, &(itup->t_tid)); - - /* remember which buffer we have pinned */ - so->btso_curbuf = buf; - } - else if (keysok >= so->numberOfFirstKeys) - { - so->btso_curbuf = buf; - return _bt_next(scan, dir); } - else if (keysok == ((Size) -1) && ScanDirectionIsBackward(dir)) + else if (keysok >= so->numberOfFirstKeys || + (keysok == ((Size) -1) && ScanDirectionIsBackward(dir))) { - so->btso_curbuf = buf; - return _bt_next(scan, dir); + /* no, but there might be another one that is */ + res = _bt_next(scan, dir); } else { + /* no tuples in the index match this scan key */ +nomatches: ItemPointerSetInvalid(current); so->btso_curbuf = InvalidBuffer; _bt_relbuf(rel, buf, BT_READ); res = (RetrieveIndexResult) NULL; } + pfree(scankeys); + return res; } @@ -1047,276 +767,128 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) * _bt_step() -- Step one item in the requested direction in a scan on * the tree. * - * If no adjacent record exists in the requested direction, return - * false. Else, return true and set the currentItemData for the - * scan to the right thing. + * *bufP is the current buffer (read-locked and pinned). If we change + * pages, it's updated appropriately. + * + * If successful, update scan's currentItemData and return true. + * If no adjacent record exists in the requested direction, + * release buffer pin/locks and return false. */ bool _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) { + Relation rel = scan->relation; + ItemPointer current = &(scan->currentItemData); + BTScanOpaque so = (BTScanOpaque) scan->opaque; Page page; BTPageOpaque opaque; OffsetNumber offnum, maxoff; - OffsetNumber start; BlockNumber blkno; BlockNumber obknum; - BTScanOpaque so; - ItemPointer current; - Relation rel; - - rel = scan->relation; - current = &(scan->currentItemData); /* * Don't use ItemPointerGetOffsetNumber or you risk to get assertion * due to ability of ip_posid to be equal 0. */ offnum = current->ip_posid; + page = BufferGetPage(*bufP); opaque = (BTPageOpaque) PageGetSpecialPointer(page); - so = (BTScanOpaque) scan->opaque; maxoff = PageGetMaxOffsetNumber(page); - /* get the next tuple */ if (ScanDirectionIsForward(dir)) { if (!PageIsEmpty(page) && offnum < maxoff) offnum = OffsetNumberNext(offnum); else { - - /* if we're at end of scan, release the buffer and return */ - blkno = opaque->btpo_next; - if (P_RIGHTMOST(opaque)) - { - _bt_relbuf(rel, *bufP, BT_READ); - ItemPointerSetInvalid(current); - *bufP = so->btso_curbuf = InvalidBuffer; - return false; - } - else + /* walk right to the next page with data */ + for (;;) { - - /* walk right to the next page with data */ - _bt_relbuf(rel, *bufP, BT_READ); - for (;;) + /* if we're at end of scan, release the buffer and return */ + if (P_RIGHTMOST(opaque)) { - *bufP = _bt_getbuf(rel, blkno, BT_READ); - page = BufferGetPage(*bufP); - opaque = (BTPageOpaque) PageGetSpecialPointer(page); - maxoff = PageGetMaxOffsetNumber(page); - start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY; - - if (!PageIsEmpty(page) && start <= maxoff) - break; - else - { - blkno = opaque->btpo_next; - _bt_relbuf(rel, *bufP, BT_READ); - if (blkno == P_NONE) - { - *bufP = so->btso_curbuf = InvalidBuffer; - ItemPointerSetInvalid(current); - return false; - } - } + _bt_relbuf(rel, *bufP, BT_READ); + ItemPointerSetInvalid(current); + *bufP = so->btso_curbuf = InvalidBuffer; + return false; } - offnum = start; + /* step right one page */ + blkno = opaque->btpo_next; + _bt_relbuf(rel, *bufP, BT_READ); + *bufP = _bt_getbuf(rel, blkno, BT_READ); + page = BufferGetPage(*bufP); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + maxoff = PageGetMaxOffsetNumber(page); + /* done if it's not empty */ + offnum = P_FIRSTDATAKEY(opaque); + if (!PageIsEmpty(page) && offnum <= maxoff) + break; } } } - else if (ScanDirectionIsBackward(dir)) + else { - - /* remember that high key is item zero on non-rightmost pages */ - start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY; - - if (offnum > start) + if (offnum > P_FIRSTDATAKEY(opaque)) offnum = OffsetNumberPrev(offnum); else { - - /* if we're at end of scan, release the buffer and return */ - blkno = opaque->btpo_prev; - if (P_LEFTMOST(opaque)) - { - _bt_relbuf(rel, *bufP, BT_READ); - *bufP = so->btso_curbuf = InvalidBuffer; - ItemPointerSetInvalid(current); - return false; - } - else + /* walk left to the next page with data */ + for (;;) { - + /* if we're at end of scan, release the buffer and return */ + if (P_LEFTMOST(opaque)) + { + _bt_relbuf(rel, *bufP, BT_READ); + ItemPointerSetInvalid(current); + *bufP = so->btso_curbuf = InvalidBuffer; + return false; + } + /* step left */ obknum = BufferGetBlockNumber(*bufP); - - /* walk right to the next page with data */ + blkno = opaque->btpo_prev; _bt_relbuf(rel, *bufP, BT_READ); - for (;;) + *bufP = _bt_getbuf(rel, blkno, BT_READ); + page = BufferGetPage(*bufP); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + /* + * If the adjacent page just split, then we have to walk + * right to find the block that's now adjacent to where + * we were. Because pages only split right, we don't have + * to worry about this failing to terminate. + */ + while (opaque->btpo_next != obknum) { + blkno = opaque->btpo_next; + _bt_relbuf(rel, *bufP, BT_READ); *bufP = _bt_getbuf(rel, blkno, BT_READ); page = BufferGetPage(*bufP); opaque = (BTPageOpaque) PageGetSpecialPointer(page); - maxoff = PageGetMaxOffsetNumber(page); - - /* - * If the adjacent page just split, then we may have - * the wrong block. Handle this case. Because pages - * only split right, we don't have to worry about this - * failing to terminate. - */ - - while (opaque->btpo_next != obknum) - { - blkno = opaque->btpo_next; - _bt_relbuf(rel, *bufP, BT_READ); - *bufP = _bt_getbuf(rel, blkno, BT_READ); - page = BufferGetPage(*bufP); - opaque = (BTPageOpaque) PageGetSpecialPointer(page); - maxoff = PageGetMaxOffsetNumber(page); - } - - /* don't consider the high key */ - start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY; - - /* anything to look at here? */ - if (!PageIsEmpty(page) && maxoff >= start) - break; - else - { - blkno = opaque->btpo_prev; - obknum = BufferGetBlockNumber(*bufP); - _bt_relbuf(rel, *bufP, BT_READ); - if (blkno == P_NONE) - { - *bufP = so->btso_curbuf = InvalidBuffer; - ItemPointerSetInvalid(current); - return false; - } - } } - offnum = maxoff;/* XXX PageIsEmpty? */ + /* done if it's not empty */ + maxoff = PageGetMaxOffsetNumber(page); + offnum = maxoff; + if (!PageIsEmpty(page) && maxoff >= P_FIRSTDATAKEY(opaque)) + break; } } } - blkno = BufferGetBlockNumber(*bufP); + + /* Update scan state */ so->btso_curbuf = *bufP; + blkno = BufferGetBlockNumber(*bufP); ItemPointerSet(current, blkno, offnum); return true; } /* - * _bt_twostep() -- Move to an adjacent record in a scan on the tree, - * if an adjacent record exists. - * - * This is like _bt_step, except that if no adjacent record exists - * it restores us to where we were before trying the step. This is - * only hairy when you cross page boundaries, since the page you cross - * from could have records inserted or deleted, or could even split. - * This is unlikely, but we try to handle it correctly here anyway. - * - * This routine contains the only case in which our changes to Lehman - * and Yao's algorithm. - * - * Like step, this routine leaves the scan's currentItemData in the - * proper state and acquires a lock and pin on *bufP. If the twostep - * succeeded, we return true; otherwise, we return false. - */ -static bool -_bt_twostep(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) -{ - Page page; - BTPageOpaque opaque; - OffsetNumber offnum, - maxoff; - OffsetNumber start; - ItemPointer current; - ItemId itemid; - int itemsz; - BTItem btitem; - BTItem svitem; - BlockNumber blkno; - - blkno = BufferGetBlockNumber(*bufP); - page = BufferGetPage(*bufP); - opaque = (BTPageOpaque) PageGetSpecialPointer(page); - maxoff = PageGetMaxOffsetNumber(page); - current = &(scan->currentItemData); - offnum = ItemPointerGetOffsetNumber(current); - - start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY; - - /* if we're safe, just do it */ - if (ScanDirectionIsForward(dir) && offnum < maxoff) - { /* XXX PageIsEmpty? */ - ItemPointerSet(current, blkno, OffsetNumberNext(offnum)); - return true; - } - else if (ScanDirectionIsBackward(dir) && offnum > start) - { - ItemPointerSet(current, blkno, OffsetNumberPrev(offnum)); - return true; - } - - /* if we've hit end of scan we don't have to do any work */ - if (ScanDirectionIsForward(dir) && P_RIGHTMOST(opaque)) - return false; - else if (ScanDirectionIsBackward(dir) && P_LEFTMOST(opaque)) - return false; - - /* - * Okay, it's off the page; let _bt_step() do the hard work, and we'll - * try to remember where we were. This is not guaranteed to work; - * this is the only place in the code where concurrency can screw us - * up, and it's because we want to be able to move in two directions - * in the scan. - */ - - itemid = PageGetItemId(page, offnum); - itemsz = ItemIdGetLength(itemid); - btitem = (BTItem) PageGetItem(page, itemid); - svitem = (BTItem) palloc(itemsz); - memmove((char *) svitem, (char *) btitem, itemsz); - - if (_bt_step(scan, bufP, dir)) - { - pfree(svitem); - return true; - } - - /* try to find our place again */ - *bufP = _bt_getbuf(scan->relation, blkno, BT_READ); - page = BufferGetPage(*bufP); - maxoff = PageGetMaxOffsetNumber(page); - - while (offnum <= maxoff) - { - itemid = PageGetItemId(page, offnum); - btitem = (BTItem) PageGetItem(page, itemid); - if (BTItemSame(btitem, svitem)) - { - pfree(svitem); - ItemPointerSet(current, blkno, offnum); - return false; - } - } - - /* - * XXX crash and burn -- can't find our place. We can be a little - * smarter -- walk to the next page to the right, for example, since - * that's the only direction that splits happen in. Deletions screw - * us up less often since they're only done by the vacuum daemon. - */ - - elog(ERROR, "btree synchronization error: concurrent update botched scan"); - - return false; -} - -/* * _bt_endpoint() -- Find the first or last key in the index. + * + * This is used by _bt_first() to set up a scan when we've determined + * that the scan must start at the beginning or end of the index (for + * a forward or backward scan respectively). */ static RetrieveIndexResult _bt_endpoint(IndexScanDesc scan, ScanDirection dir) @@ -1328,7 +900,7 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) ItemPointer current; OffsetNumber offnum, maxoff; - OffsetNumber start = 0; + OffsetNumber start; BlockNumber blkno; BTItem btitem; IndexTuple itup; @@ -1340,38 +912,50 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) current = &(scan->currentItemData); so = (BTScanOpaque) scan->opaque; + /* + * Scan down to the leftmost or rightmost leaf page. This is a + * simplified version of _bt_search(). We don't maintain a stack + * since we know we won't need it. + */ buf = _bt_getroot(rel, BT_READ); + + if (! BufferIsValid(buf)) + { + /* empty index... */ + ItemPointerSetInvalid(current); + so->btso_curbuf = InvalidBuffer; + return (RetrieveIndexResult) NULL; + } + blkno = BufferGetBlockNumber(buf); page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); for (;;) { - if (opaque->btpo_flags & BTP_LEAF) + if (P_ISLEAF(opaque)) break; if (ScanDirectionIsForward(dir)) - offnum = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY; + offnum = P_FIRSTDATAKEY(opaque); else offnum = PageGetMaxOffsetNumber(page); btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum)); itup = &(btitem->bti_itup); - blkno = ItemPointerGetBlockNumber(&(itup->t_tid)); _bt_relbuf(rel, buf, BT_READ); buf = _bt_getbuf(rel, blkno, BT_READ); + page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); /* - * Race condition: If the child page we just stepped onto is in - * the process of being split, we need to make sure we're all the - * way at the right edge of the tree. See the paper by Lehman and - * Yao. + * Race condition: If the child page we just stepped onto was just + * split, we need to make sure we're all the way at the right edge + * of the tree. See the paper by Lehman and Yao. */ - if (ScanDirectionIsBackward(dir) && !P_RIGHTMOST(opaque)) { do @@ -1390,101 +974,39 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) if (ScanDirectionIsForward(dir)) { - if (!P_LEFTMOST(opaque))/* non-leftmost page ? */ - elog(ERROR, "_bt_endpoint: leftmost page (%u) has not leftmost flag", blkno); - start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY; - - /* - * I don't understand this stuff! It doesn't work for - * non-rightmost pages with only one element (P_HIKEY) which we - * have after deletion itups by vacuum (it's case of start > - * maxoff). Scanning in BackwardScanDirection is not - * understandable at all. Well - new stuff. - vadim 12/06/96 - */ -#ifdef NOT_USED - if (PageIsEmpty(page) || start > maxoff) - { - ItemPointerSet(current, blkno, maxoff); - if (!_bt_step(scan, &buf, BackwardScanDirection)) - return (RetrieveIndexResult) NULL; - - start = ItemPointerGetOffsetNumber(current); - page = BufferGetPage(buf); - } -#endif - if (PageIsEmpty(page)) - { - if (start != P_HIKEY) /* non-rightmost page */ - elog(ERROR, "_bt_endpoint: non-rightmost page (%u) is empty", blkno); + Assert(P_LEFTMOST(opaque)); - /* - * It's left- & right- most page - root page, - and it's - * empty... - */ - _bt_relbuf(rel, buf, BT_READ); - ItemPointerSetInvalid(current); - so->btso_curbuf = InvalidBuffer; - return (RetrieveIndexResult) NULL; - } - if (start > maxoff) /* start == 2 && maxoff == 1 */ - { - ItemPointerSet(current, blkno, maxoff); - if (!_bt_step(scan, &buf, ForwardScanDirection)) - return (RetrieveIndexResult) NULL; - - start = ItemPointerGetOffsetNumber(current); - page = BufferGetPage(buf); - } - /* new stuff ends here */ - else - ItemPointerSet(current, blkno, start); + start = P_FIRSTDATAKEY(opaque); } else if (ScanDirectionIsBackward(dir)) { + Assert(P_RIGHTMOST(opaque)); - /* - * I don't understand this stuff too! If RIGHT-most leaf page is - * empty why do scanning in ForwardScanDirection ??? Well - new - * stuff. - vadim 12/06/96 - */ -#ifdef NOT_USED - if (PageIsEmpty(page)) - { - ItemPointerSet(current, blkno, FirstOffsetNumber); - if (!_bt_step(scan, &buf, ForwardScanDirection)) - return (RetrieveIndexResult) NULL; - - start = ItemPointerGetOffsetNumber(current); - page = BufferGetPage(buf); - } -#endif - if (PageIsEmpty(page)) - { - /* If it's leftmost page too - it's empty root page... */ - if (P_LEFTMOST(opaque)) - { - _bt_relbuf(rel, buf, BT_READ); - ItemPointerSetInvalid(current); - so->btso_curbuf = InvalidBuffer; - return (RetrieveIndexResult) NULL; - } - /* Go back ! */ - ItemPointerSet(current, blkno, FirstOffsetNumber); - if (!_bt_step(scan, &buf, BackwardScanDirection)) - return (RetrieveIndexResult) NULL; - - start = ItemPointerGetOffsetNumber(current); - page = BufferGetPage(buf); - } - /* new stuff ends here */ - else - { - start = PageGetMaxOffsetNumber(page); - ItemPointerSet(current, blkno, start); - } + start = PageGetMaxOffsetNumber(page); + if (start < P_FIRSTDATAKEY(opaque)) /* watch out for empty page */ + start = P_FIRSTDATAKEY(opaque); } else + { elog(ERROR, "Illegal scan direction %d", dir); + start = 0; /* keep compiler quiet */ + } + + ItemPointerSet(current, blkno, start); + /* remember which buffer we have pinned */ + so->btso_curbuf = buf; + + /* + * Left/rightmost page could be empty due to deletions, + * if so step till we find a nonempty page. + */ + if (start > maxoff) + { + if (!_bt_step(scan, &buf, dir)) + return (RetrieveIndexResult) NULL; + start = ItemPointerGetOffsetNumber(current); + page = BufferGetPage(buf); + } btitem = (BTItem) PageGetItem(page, PageGetItemId(page, start)); itup = &(btitem->bti_itup); @@ -1492,23 +1014,18 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) /* see if we picked a winner */ if (_bt_checkkeys(scan, itup, &keysok)) { + /* yes, return it */ res = FormRetrieveIndexResult(current, &(itup->t_tid)); - - /* remember which buffer we have pinned */ - so->btso_curbuf = buf; - } - else if (keysok >= so->numberOfFirstKeys) - { - so->btso_curbuf = buf; - return _bt_next(scan, dir); } - else if (keysok == ((Size) -1) && ScanDirectionIsBackward(dir)) + else if (keysok >= so->numberOfFirstKeys || + (keysok == ((Size) -1) && ScanDirectionIsBackward(dir))) { - so->btso_curbuf = buf; - return _bt_next(scan, dir); + /* no, but there might be another one that is */ + res = _bt_next(scan, dir); } else { + /* no tuples in the index match this scan key */ ItemPointerSetInvalid(current); so->btso_curbuf = InvalidBuffer; _bt_relbuf(rel, buf, BT_READ); |