aboutsummaryrefslogtreecommitdiff
path: root/src/backend/access/nbtree/nbtsearch.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/access/nbtree/nbtsearch.c')
-rw-r--r--src/backend/access/nbtree/nbtsearch.c68
1 files changed, 56 insertions, 12 deletions
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index bc62fbc02e6..0bf12f0e107 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -51,7 +51,8 @@ static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
* NOTE that the returned buffer is read-locked regardless of the access
* parameter. However, access = BT_WRITE will allow an empty root page
* to be created and returned. When access = BT_READ, an empty index
- * will result in *bufP being set to InvalidBuffer.
+ * will result in *bufP being set to InvalidBuffer. Also, in BT_WRITE mode,
+ * any incomplete splits encountered during the search will be finished.
*/
BTStack
_bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
@@ -82,8 +83,17 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
* Race -- the page we just grabbed may have split since we read its
* pointer in the parent (or metapage). If it has, we may need to
* move right to its new sibling. Do that.
+ *
+ * In write-mode, allow _bt_moveright to finish any incomplete splits
+ * along the way. Strictly speaking, we'd only need to finish an
+ * incomplete split on the leaf page we're about to insert to, not on
+ * any of the upper levels (they is taken care of in _bt_getstackbuf,
+ * if the leaf page is split and we insert to the parent page). But
+ * this is a good opportunity to finish splits of internal pages too.
*/
- *bufP = _bt_moveright(rel, *bufP, keysz, scankey, nextkey, BT_READ);
+ *bufP = _bt_moveright(rel, *bufP, keysz, scankey, nextkey,
+ (access == BT_WRITE), stack_in,
+ BT_READ);
/* if this is a leaf page, we're done */
page = BufferGetPage(*bufP);
@@ -148,6 +158,11 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
* item >= scankey. When nextkey is true, we are looking for the first
* item strictly greater than scankey.
*
+ * If forupdate is true, we will attempt to finish any incomplete splits
+ * that we encounter. This is required when locking a target page for an
+ * insertion, because we don't allow inserting on a page before the split
+ * is completed. 'stack' is only used if forupdate is true.
+ *
* On entry, we have the buffer pinned and a lock of the type specified by
* 'access'. If we move right, we release the buffer and lock and acquire
* the same on the right sibling. Return value is the buffer we stop at.
@@ -158,15 +173,14 @@ _bt_moveright(Relation rel,
int keysz,
ScanKey scankey,
bool nextkey,
+ bool forupdate,
+ BTStack stack,
int access)
{
Page page;
BTPageOpaque opaque;
int32 cmpval;
- page = BufferGetPage(buf);
- opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-
/*
* When nextkey = false (normal case): if the scan key that brought us to
* this page is > the high key stored on the page, then the page has split
@@ -184,16 +198,46 @@ _bt_moveright(Relation rel,
*/
cmpval = nextkey ? 0 : 1;
- while (!P_RIGHTMOST(opaque) &&
- (P_IGNORE(opaque) ||
- _bt_compare(rel, keysz, scankey, page, P_HIKEY) >= cmpval))
+ for (;;)
{
- /* step right one page */
- BlockNumber rblkno = opaque->btpo_next;
-
- buf = _bt_relandgetbuf(rel, buf, rblkno, access);
page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ if (P_RIGHTMOST(opaque))
+ break;
+
+ /*
+ * Finish any incomplete splits we encounter along the way.
+ */
+ if (forupdate && P_INCOMPLETE_SPLIT(opaque))
+ {
+ BlockNumber blkno = BufferGetBlockNumber(buf);
+
+ /* upgrade our lock if necessary */
+ if (access == BT_READ)
+ {
+ LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+ LockBuffer(buf, BT_WRITE);
+ }
+
+ if (P_INCOMPLETE_SPLIT(opaque))
+ _bt_finish_split(rel, buf, stack);
+ else
+ _bt_relbuf(rel, buf);
+
+ /* re-acquire the lock in the right mode, and re-check */
+ buf = _bt_getbuf(rel, blkno, access);
+ continue;
+ }
+
+ if (P_IGNORE(opaque) || _bt_compare(rel, keysz, scankey, page, P_HIKEY) >= cmpval)
+ {
+ /* step right one page */
+ buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
+ continue;
+ }
+ else
+ break;
}
if (P_IGNORE(opaque))