diff options
Diffstat (limited to 'src/backend/access/nbtree/nbtree.c')
-rw-r--r-- | src/backend/access/nbtree/nbtree.c | 155 |
1 files changed, 72 insertions, 83 deletions
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index b7a1e7ada1c..d73af9e6782 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -12,7 +12,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.97 2003/02/23 06:17:13 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.98 2003/02/23 22:43:08 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -572,121 +572,110 @@ btbulkdelete(PG_FUNCTION_ARGS) IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(1); void *callback_state = (void *) PG_GETARG_POINTER(2); IndexBulkDeleteResult *result; - BlockNumber num_pages; double tuples_removed; double num_index_tuples; - IndexScanDesc scan; - BTScanOpaque so; - ItemPointer current; + OffsetNumber deletable[BLCKSZ / sizeof(OffsetNumber)]; + int ndeletable; + Buffer buf; + BlockNumber num_pages; tuples_removed = 0; num_index_tuples = 0; /* - * We use a standard IndexScanDesc scan object, but to speed up the - * loop, we skip most of the wrapper layers of index_getnext and - * instead call _bt_step directly. This implies holding buffer lock - * on a target page throughout the loop over the page's tuples. - * - * Whenever we step onto a new page, we have to trade in the read - * lock acquired by _bt_first or _bt_step for an exclusive write lock - * (fortunately, _bt_relbuf doesn't care which kind of lock it's - * releasing when it comes time for _bt_step to release our lock). + * The outer loop iterates over index leaf pages, the inner over items + * on a leaf page. We issue just one _bt_delitems() call per page, + * so as to minimize WAL traffic. * - * Note that we exclusive-lock every leaf page, or at least every one - * containing data items. It sounds attractive to only exclusive-lock + * Note that we exclusive-lock every leaf page containing data items, + * in sequence left to right. It sounds attractive to only exclusive-lock * those containing items we need to delete, but unfortunately that * is not safe: we could then pass a stopped indexscan, which could * in rare cases lead to deleting the item it needs to find when it * resumes. (See _bt_restscan --- this could only happen if an indexscan * stops on a deletable item and then a page split moves that item * into a page further to its right, which the indexscan will have no - * pin on.) + * pin on.) We can skip obtaining exclusive lock on empty pages + * though, since no indexscan could be stopped on those. */ - scan = index_beginscan(NULL, rel, SnapshotAny, 0, (ScanKey) NULL); - so = (BTScanOpaque) scan->opaque; - current = &(scan->currentItemData); - - /* Use _bt_first to get started, then _bt_step to remaining tuples */ - if (_bt_first(scan, ForwardScanDirection)) + buf = _bt_get_endpoint(rel, 0, false); + if (BufferIsValid(buf)) /* check for empty index */ { - Buffer buf; - BlockNumber lockedBlock = InvalidBlockNumber; - - /* we have the buffer pinned and read-locked */ - buf = so->btso_curbuf; - Assert(BufferIsValid(buf)); - - do + for (;;) { Page page; - BlockNumber blkno; - OffsetNumber offnum; - BTItem btitem; BTPageOpaque opaque; - IndexTuple itup; - ItemPointer htup; + OffsetNumber offnum, + minoff, + maxoff; + BlockNumber nextpage; CHECK_FOR_INTERRUPTS(); - /* current is the next index tuple */ + ndeletable = 0; page = BufferGetPage(buf); - blkno = ItemPointerGetBlockNumber(current); - - /* - * Make sure we have a super-exclusive write lock on this page. - * - * We assume that only concurrent insertions, not deletions, - * can occur while we're not holding the page lock (the - * caller should hold a suitable relation lock to ensure - * this). Therefore, no items can escape being scanned because - * of this temporary lock release. - */ - if (blkno != lockedBlock) + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + minoff = P_FIRSTDATAKEY(opaque); + maxoff = PageGetMaxOffsetNumber(page); + /* We probably cannot see deleted pages, but skip 'em if so */ + if (minoff <= maxoff && !P_ISDELETED(opaque)) { + /* + * Trade in the initial read lock for a super-exclusive + * write lock on this page. + */ LockBuffer(buf, BUFFER_LOCK_UNLOCK); LockBufferForCleanup(buf); - lockedBlock = blkno; /* - * If the page was formerly rightmost but was split while we - * didn't hold the lock, and ip_posid is pointing to item - * 1, then ip_posid now points at the high key not a valid - * data item. In this case we need to step forward. + * Recompute minoff/maxoff, both of which could have changed + * while we weren't holding the lock. */ - opaque = (BTPageOpaque) PageGetSpecialPointer(page); - if (current->ip_posid < P_FIRSTDATAKEY(opaque)) - current->ip_posid = P_FIRSTDATAKEY(opaque); - } - - offnum = ItemPointerGetOffsetNumber(current); - btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum)); - itup = &btitem->bti_itup; - htup = &(itup->t_tid); - - if (callback(htup, callback_state)) - { - /* Okay to delete the item from the page */ - _bt_itemdel(rel, buf, current); - - /* Mark buffer dirty, but keep the lock and pin */ - WriteNoReleaseBuffer(buf); - - tuples_removed += 1; - + minoff = P_FIRSTDATAKEY(opaque); + maxoff = PageGetMaxOffsetNumber(page); /* - * We now need to back up the scan one item, so that the next - * cycle will re-examine the same offnum on this page (which - * now holds the next item). + * Scan over all items to see which ones need deleted + * according to the callback function. */ - current->ip_posid--; + for (offnum = minoff; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + BTItem btitem; + ItemPointer htup; + + btitem = (BTItem) PageGetItem(page, + PageGetItemId(page, offnum)); + htup = &(btitem->bti_itup.t_tid); + if (callback(htup, callback_state)) + { + deletable[ndeletable++] = offnum; + tuples_removed += 1; + } + else + num_index_tuples += 1; + } + } + /* + * If we need to delete anything, do it and write the buffer; + * else just release the buffer. + */ + nextpage = opaque->btpo_next; + if (ndeletable > 0) + { + _bt_delitems(rel, buf, deletable, ndeletable); + _bt_wrtbuf(rel, buf); } else - num_index_tuples += 1; - } while (_bt_step(scan, &buf, ForwardScanDirection)); + { + _bt_relbuf(rel, buf); + } + /* And advance to next page, if any */ + if (nextpage == P_NONE) + break; + buf = _bt_getbuf(rel, nextpage, BT_READ); + } } - index_endscan(scan); - /* return statistics */ num_pages = RelationGetNumberOfBlocks(rel); @@ -765,7 +754,7 @@ btvacuumcleanup(PG_FUNCTION_ARGS) } } else if ((opaque->btpo_flags & BTP_HALF_DEAD) || - P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page)) + P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page)) { /* Empty, try to delete */ int ndel; |