aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/backend/access/nbtree/README8
-rw-r--r--src/backend/access/nbtree/nbtinsert.c46
-rw-r--r--src/backend/access/nbtree/nbtpage.c3
-rw-r--r--src/backend/access/nbtree/nbtsearch.c19
-rw-r--r--src/include/access/nbtree.h16
5 files changed, 50 insertions, 42 deletions
diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README
index dce8bc98e93..c8bdb0c935d 100644
--- a/src/backend/access/nbtree/README
+++ b/src/backend/access/nbtree/README
@@ -224,7 +224,13 @@ it, but it's still linked to its siblings.
(Note: Lanin and Shasha prefer to make the key space move left, but their
argument for doing so hinges on not having left-links, which we have
-anyway. So we simplify the algorithm by moving key space right.)
+anyway. So we simplify the algorithm by moving the key space right. Note
+also that Lanin and Shasha optimistically avoid holding multiple locks as
+the tree is ascended. They're willing to release all locks and retry in
+"rare" cases where the correct location for a new downlink cannot be found
+immediately. We prefer to stick with Lehman and Yao's approach of
+pessimistically coupling buffer locks when ascending the tree, since it's
+far simpler.)
To preserve consistency on the parent level, we cannot merge the key space
of a page into its right sibling unless the right sibling is a child of
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index 5890f393f67..48d19be3aab 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -1797,7 +1797,6 @@ _bt_insert_parent(Relation rel,
stack = &fakestack;
stack->bts_blkno = BufferGetBlockNumber(pbuf);
stack->bts_offset = InvalidOffsetNumber;
- stack->bts_btentry = InvalidBlockNumber;
stack->bts_parent = NULL;
_bt_relbuf(rel, pbuf);
}
@@ -1819,8 +1818,7 @@ _bt_insert_parent(Relation rel,
* new downlink will be inserted at the correct offset. Even buf's
* parent may have changed.
*/
- stack->bts_btentry = bknum;
- pbuf = _bt_getstackbuf(rel, stack);
+ pbuf = _bt_getstackbuf(rel, stack, bknum);
/*
* Now we can unlock the right child. The left child will be unlocked
@@ -1834,7 +1832,7 @@ _bt_insert_parent(Relation rel,
errmsg_internal("failed to re-find parent key in index \"%s\" for split pages %u/%u",
RelationGetRelationName(rel), bknum, rbknum)));
- /* Recursively update the parent */
+ /* Recursively insert into the parent */
_bt_insertonpg(rel, NULL, pbuf, buf, stack->bts_parent,
new_item, stack->bts_offset + 1,
is_only);
@@ -1901,21 +1899,37 @@ _bt_finish_split(Relation rel, Buffer lbuf, BTStack stack)
}
/*
- * _bt_getstackbuf() -- Walk back up the tree one step, and find the item
- * we last looked at in the parent.
+ * _bt_getstackbuf() -- Walk back up the tree one step, and find the pivot
+ * tuple whose downlink points to child page.
*
- * This is possible because we save the downlink from the parent item,
- * which is enough to uniquely identify it. Insertions into the parent
- * level could cause the item to move right; deletions could cause it
- * to move left, but not left of the page we previously found it in.
+ * Caller passes child's block number, which is used to identify
+ * associated pivot tuple in parent page using a linear search that
+ * matches on pivot's downlink/block number. The expected location of
+ * the pivot tuple is taken from the stack one level above the child
+ * page. This is used as a starting point. Insertions into the
+ * parent level could cause the pivot tuple to move right; deletions
+ * could cause it to move left, but not left of the page we previously
+ * found it on.
*
- * Adjusts bts_blkno & bts_offset if changed.
+ * Caller can use its stack to relocate the pivot tuple/downlink for
+ * any same-level page to the right of the page found by its initial
+ * descent. This is necessary because of the possibility that caller
+ * moved right to recover from a concurrent page split. It's also
+ * convenient for certain callers to be able to step right when there
+ * wasn't a concurrent page split, while still using their original
+ * stack. For example, the checkingunique _bt_doinsert() case may
+ * have to step right when there are many physical duplicates, and its
+ * scantid forces an insertion to the right of the "first page the
+ * value could be on".
*
- * Returns write-locked buffer, or InvalidBuffer if item not found
- * (should not happen).
+ * Returns write-locked parent page buffer, or InvalidBuffer if pivot
+ * tuple not found (should not happen). Adjusts bts_blkno &
+ * bts_offset if changed. Page split caller should insert its new
+ * pivot tuple for its new right sibling page on parent page, at the
+ * offset number bts_offset + 1.
*/
Buffer
-_bt_getstackbuf(Relation rel, BTStack stack)
+_bt_getstackbuf(Relation rel, BTStack stack, BlockNumber child)
{
BlockNumber blkno;
OffsetNumber start;
@@ -1977,7 +1991,7 @@ _bt_getstackbuf(Relation rel, BTStack stack)
itemid = PageGetItemId(page, offnum);
item = (IndexTuple) PageGetItem(page, itemid);
- if (BTreeInnerTupleGetDownLink(item) == stack->bts_btentry)
+ if (BTreeInnerTupleGetDownLink(item) == child)
{
/* Return accurate pointer to where link is now */
stack->bts_blkno = blkno;
@@ -1993,7 +2007,7 @@ _bt_getstackbuf(Relation rel, BTStack stack)
itemid = PageGetItemId(page, offnum);
item = (IndexTuple) PageGetItem(page, itemid);
- if (BTreeInnerTupleGetDownLink(item) == stack->bts_btentry)
+ if (BTreeInnerTupleGetDownLink(item) == child)
{
/* Return accurate pointer to where link is now */
stack->bts_blkno = blkno;
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index 7c0e6d13a94..18c6de21c1c 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -1189,8 +1189,7 @@ _bt_lock_branch_parent(Relation rel, BlockNumber child, BTStack stack,
* non-unique high keys in leaf level pages. Even heapkeyspace indexes
* can have a stale stack due to insertions into the parent.
*/
- stack->bts_btentry = child;
- pbuf = _bt_getstackbuf(rel, stack);
+ pbuf = _bt_getstackbuf(rel, stack, child);
if (pbuf == InvalidBuffer)
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 19735bf7330..7f77ed24c5c 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -146,23 +146,16 @@ _bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access,
par_blkno = BufferGetBlockNumber(*bufP);
/*
- * We need to save the location of the index entry we chose in the
- * parent page on a stack. In case we split the tree, we'll use the
- * stack to work back up to the parent page. We also save the actual
- * downlink (block) to uniquely identify the index entry, in case it
- * moves right while we're working lower in the tree. See the paper
- * by Lehman and Yao for how this is detected and handled. (We use the
- * child link during the second half of a page split -- if caller ends
- * up splitting the child it usually ends up inserting a new pivot
- * tuple for child's new right sibling immediately after the original
- * bts_offset offset recorded here. The downlink block will be needed
- * to check if bts_offset remains the position of this same pivot
- * tuple.)
+ * We need to save the location of the pivot tuple we chose in the
+ * parent page on a stack. If we need to split a page, we'll use
+ * the stack to work back up to its parent page. If caller ends up
+ * splitting a page one level down, it usually ends up inserting a
+ * new pivot tuple/downlink immediately after the location recorded
+ * here.
*/
new_stack = (BTStack) palloc(sizeof(BTStackData));
new_stack->bts_blkno = par_blkno;
new_stack->bts_offset = offnum;
- new_stack->bts_btentry = blkno;
new_stack->bts_parent = stack_in;
/*
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 83e0e6c28e0..7e54c456f7d 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -403,20 +403,16 @@ typedef struct BTMetaPageData
#define BT_WRITE BUFFER_LOCK_EXCLUSIVE
/*
- * BTStackData -- As we descend a tree, we push the (location, downlink)
- * pairs from internal pages onto a private stack. If we split a
- * leaf, we use this stack to walk back up the tree and insert data
- * into parent pages (and possibly to split them, too). Lehman and
- * Yao's update algorithm guarantees that under no circumstances can
- * our private stack give us an irredeemably bad picture up the tree.
- * Again, see the paper for details.
+ * BTStackData -- As we descend a tree, we push the location of pivot
+ * tuples whose downlink we are about to follow onto a private stack. If
+ * we split a leaf, we use this stack to walk back up the tree and insert
+ * data into its parent page at the correct location. We may also have to
+ * recursively split a grandparent of the leaf page (and so on).
*/
-
typedef struct BTStackData
{
BlockNumber bts_blkno;
OffsetNumber bts_offset;
- BlockNumber bts_btentry;
struct BTStackData *bts_parent;
} BTStackData;
@@ -731,7 +727,7 @@ extern void _bt_parallel_advance_array_keys(IndexScanDesc scan);
*/
extern bool _bt_doinsert(Relation rel, IndexTuple itup,
IndexUniqueCheck checkUnique, Relation heapRel);
-extern Buffer _bt_getstackbuf(Relation rel, BTStack stack);
+extern Buffer _bt_getstackbuf(Relation rel, BTStack stack, BlockNumber child);
extern void _bt_finish_split(Relation rel, Buffer bbuf, BTStack stack);
/*