aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/backend/access/nbtree/README23
-rw-r--r--src/backend/access/nbtree/nbtinsert.c6
-rw-r--r--src/backend/access/nbtree/nbtpage.c24
3 files changed, 40 insertions, 13 deletions
diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README
index b5033628348..187c147b0af 100644
--- a/src/backend/access/nbtree/README
+++ b/src/backend/access/nbtree/README
@@ -1,4 +1,4 @@
-$PostgreSQL: pgsql/src/backend/access/nbtree/README,v 1.8 2003/11/29 19:51:40 pgsql Exp $
+$PostgreSQL: pgsql/src/backend/access/nbtree/README,v 1.8.4.1 2006/11/01 19:50:08 tgl Exp $
This directory contains a correct implementation of Lehman and Yao's
high-concurrency B-tree management algorithm (P. Lehman and S. Yao,
@@ -189,9 +189,24 @@ the half-dead page's level and its parent's level may be a little out of
whack: key space that appears to belong to the half-dead page's parent on the
parent level may really belong to its right sibling. We can tolerate this,
however, because insertions and deletions on upper tree levels are always
-done by reference to child page numbers, not keys. The only cost is that
-searches may sometimes descend to the half-dead page and then have to move
-right, rather than going directly to the sibling page.
+done by reference to child page numbers, not keys. Searches may sometimes
+descend to the half-dead page and then have to move right, rather than going
+directly to the sibling page, but this is no different from the behavior
+during a split.
+
+A special case that arises from using half-dead pages for rightmost children
+is that it's possible for the grandparent level's sequence of keys to become
+out-of-order. This occurs when there are a large number of insertions into
+the key space that's been implicitly transferred to the right sibling of the
+half-dead page's parent. If the right sibling itself splits, the split
+bounding key (which could be less than the high key of the parent page) is
+inserted into the grandparent level to the right of the parent page. This
+is pretty ugly, but it causes no serious damage. Searches, again, may descend
+a bit to the left of the optimal path but will be able to recover. The only
+problem is that when it comes time to delete the half-dead page, _bt_pagedel's
+normal strategy for finding the target page's parent can fail: the search for
+the page's high key may well descend to the right of the parent. In this case
+we recover by searching from the left end of the parent level.
A deleted page cannot be reclaimed immediately, since there may be other
processes waiting to reference it (ie, search processes that just left the
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index 1c5060c0cd3..fcdb7045dcf 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.119.4.1 2005/10/12 17:18:15 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.119.4.2 2006/11/01 19:50:08 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -1243,8 +1243,8 @@ _bt_insert_parent(Relation rel,
/* Check for error only after writing children */
if (pbuf == InvalidBuffer)
- elog(ERROR, "failed to re-find parent key in \"%s\"",
- RelationGetRelationName(rel));
+ elog(ERROR, "failed to re-find parent key in \"%s\" for split pages %u/%u",
+ RelationGetRelationName(rel), bknum, rbknum);
/* Recursively update the parent */
newres = _bt_insertonpg(rel, pbuf, stack->bts_parent,
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index 378c5f9d36f..03b8ca3b4ab 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -9,7 +9,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.81.4.1 2005/05/07 21:32:53 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.81.4.2 2006/11/01 19:50:08 tgl Exp $
*
* NOTES
* Postgres btree pages look like ordinary relation pages. The opaque
@@ -867,16 +867,28 @@ _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full)
rbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
/*
- * Next find and write-lock the current parent of the target page.
- * This is essentially the same as the corresponding step of
- * splitting.
+ * Next find and write-lock the current parent of the target page. This is
+ * essentially the same as the corresponding step of splitting. However,
+ * it's possible for the search to fail (for reasons explained in README).
+ * If that happens, we recover by searching the whole parent level, which
+ * is a tad inefficient but doesn't happen often enough to be a problem.
*/
ItemPointerSet(&(stack->bts_btitem.bti_itup.t_tid),
target, P_HIKEY);
pbuf = _bt_getstackbuf(rel, stack, BT_WRITE);
if (pbuf == InvalidBuffer)
- elog(ERROR, "failed to re-find parent key in \"%s\"",
- RelationGetRelationName(rel));
+ {
+ /* Find the leftmost page in the parent level */
+ pbuf = _bt_get_endpoint(rel, opaque->btpo.level + 1, false);
+ stack->bts_blkno = BufferGetBlockNumber(pbuf);
+ stack->bts_offset = InvalidOffsetNumber;
+ _bt_relbuf(rel, pbuf);
+ /* and repeat search from there */
+ pbuf = _bt_getstackbuf(rel, stack, BT_WRITE);
+ if (pbuf == InvalidBuffer)
+ elog(ERROR, "failed to re-find parent key in \"%s\" for deletion target page %u",
+ RelationGetRelationName(rel), target);
+ }
parent = stack->bts_blkno;
poffset = stack->bts_offset;