Diffstat (limited to 'src/backend/access/nbtree/nbtinsert.c')
-rw-r--r--  src/backend/access/nbtree/nbtinsert.c | 138
1 file changed, 119 insertions(+), 19 deletions(-)
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index 2fe98673531..e85abcfd72d 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -23,6 +23,7 @@
#include "miscadmin.h"
#include "storage/lmgr.h"
#include "storage/predicate.h"
+#include "storage/smgr.h"
#include "utils/tqual.h"
@@ -85,7 +86,6 @@ static bool _bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum,
int keysz, ScanKey scankey);
static void _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel);
-
/*
* _bt_doinsert() -- Handle insertion of a single index tuple in the tree.
*
@@ -111,32 +111,121 @@ _bt_doinsert(Relation rel, IndexTuple itup,
bool is_unique = false;
int natts = rel->rd_rel->relnatts;
ScanKey itup_scankey;
- BTStack stack;
+ BTStack stack = NULL;
Buffer buf;
OffsetNumber offset;
+ bool fastpath;
/* we need an insertion scan key to do our search, so build one */
itup_scankey = _bt_mkscankey(rel, itup);
+ /*
+ * It's very common to have an index on an auto-incremented or
+ * monotonically increasing value. In such cases, every insertion happens
+ * towards the end of the index. We try to optimize that case by caching
+ * the right-most leaf of the index. If our cached block is still the
+ * rightmost leaf, has enough free space to accommodate a new entry and
+ * the insertion key is strictly greater than the first key in this page,
+ * then we can safely conclude that the new key will be inserted in the
+ * cached block. So we simply search within the cached block and insert the
+ * key at the appropriate location. We call it a fastpath.
+ *
+ * Testing has revealed, though, that the fastpath can result in increased
+ * contention on the exclusive lock on the rightmost leaf page. So we
+ * conditionally check if the lock is available. If it's not available, we
+ * simply abandon the fastpath and take the regular path. This makes sense
+ * because unavailability of the lock also signals that some other backend
+ * might be concurrently inserting into the page, thus reducing our chances
+ * of finding an insertion place in this page.
+ */
top:
- /* find the first page containing this key */
- stack = _bt_search(rel, natts, itup_scankey, false, &buf, BT_WRITE, NULL);
-
+ fastpath = false;
offset = InvalidOffsetNumber;
+ if (RelationGetTargetBlock(rel) != InvalidBlockNumber)
+ {
+ Size itemsz;
+ Page page;
+ BTPageOpaque lpageop;
- /* trade in our read lock for a write lock */
- LockBuffer(buf, BUFFER_LOCK_UNLOCK);
- LockBuffer(buf, BT_WRITE);
+ /*
+ * Conditionally acquire exclusive lock on the buffer before doing any
+ * checks. If we don't get the lock, we simply take the slowpath. If we
+ * do get the lock, this ensures that the index state cannot change, as
+ * far as the rightmost part of the index is concerned.
+ */
+ buf = ReadBuffer(rel, RelationGetTargetBlock(rel));
- /*
- * If the page was split between the time that we surrendered our read
- * lock and acquired our write lock, then this page may no longer be the
- * right place for the key we want to insert. In this case, we need to
- * move right in the tree. See Lehman and Yao for an excruciatingly
- * precise description.
- */
- buf = _bt_moveright(rel, buf, natts, itup_scankey, false,
- true, stack, BT_WRITE, NULL);
+ if (ConditionalLockBuffer(buf))
+ {
+ _bt_checkpage(rel, buf);
+
+ page = BufferGetPage(buf);
+
+ lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
+ itemsz = IndexTupleSize(itup);
+ itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this
+ * but we need to be consistent */
+
+ /*
+ * Check if the page is still the rightmost leaf page, has enough
+ * free space to accommodate the new tuple, no split is in progress,
+ * and the insertion scan key is strictly greater than the first key
+ * on the page.
+ */
+ if (P_ISLEAF(lpageop) && P_RIGHTMOST(lpageop) &&
+ !P_INCOMPLETE_SPLIT(lpageop) &&
+ !P_IGNORE(lpageop) &&
+ (PageGetFreeSpace(page) > itemsz) &&
+ PageGetMaxOffsetNumber(page) >= P_FIRSTDATAKEY(lpageop) &&
+ _bt_compare(rel, natts, itup_scankey, page,
+ P_FIRSTDATAKEY(lpageop)) > 0)
+ {
+ fastpath = true;
+ }
+ else
+ {
+ _bt_relbuf(rel, buf);
+
+ /*
+ * Something did not work out. Just forget about the cached
+ * block and follow the normal path. It might be set again if
+ * the conditions are favourable.
+ */
+ RelationSetTargetBlock(rel, InvalidBlockNumber);
+ }
+ }
+ else
+ {
+ ReleaseBuffer(buf);
+
+ /*
+ * If someone else is holding a lock on the page, it is likely to
+ * change anyway, so don't try the fastpath again until we cache an
+ * updated rightmost leaf.
+ */
+ RelationSetTargetBlock(rel, InvalidBlockNumber);
+ }
+ }
+
+ if (!fastpath)
+ {
+ /* find the first page containing this key */
+ stack = _bt_search(rel, natts, itup_scankey, false, &buf, BT_WRITE,
+ NULL);
+
+ /* trade in our read lock for a write lock */
+ LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+ LockBuffer(buf, BT_WRITE);
+
+ /*
+ * If the page was split between the time that we surrendered our read
+ * lock and acquired our write lock, then this page may no longer be
+ * the right place for the key we want to insert. In this case, we
+ * need to move right in the tree. See Lehman and Yao for an
+ * excruciatingly precise description.
+ */
+ buf = _bt_moveright(rel, buf, natts, itup_scankey, false,
+ true, stack, BT_WRITE, NULL);
+ }
/*
* If we're not allowing duplicates, make sure the key isn't already in
@@ -184,7 +273,8 @@ top:
XactLockTableWait(xwait, rel, &itup->t_tid, XLTW_InsertIndex);
/* start over... */
- _bt_freestack(stack);
+ if (stack)
+ _bt_freestack(stack);
goto top;
}
}
@@ -211,7 +301,8 @@ top:
}
/* be tidy */
- _bt_freestack(stack);
+ if (stack)
+ _bt_freestack(stack);
_bt_freeskey(itup_scankey);
return is_unique;
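
The interplay between the two halves of the patch is easy to miss when reading the hunks in isolation: _bt_insertonpg() (next hunk) re-arms the cache via RelationSetTargetBlock() after inserting into the rightmost leaf, while _bt_doinsert() (above) validates that cached block before trusting it and clears it whenever any precondition fails. The standalone C sketch below models the same hint/validate/fall-back pattern on a toy sorted array. It is an illustrative analogy only, not PostgreSQL code; all names in it (toy_index, toy_insert, have_hint) are invented for the example.

#include <stdio.h>
#include <string.h>

#define CAP 32

typedef struct
{
	int		keys[CAP];
	int		nkeys;
	int		have_hint;		/* plays the role of a valid cached target block */
} toy_index;

static void
toy_insert(toy_index *ix, int key)
{
	int		pos;

	/*
	 * Fastpath: the hint is armed, there is room, and the new key is
	 * strictly greater than the current last key, so it must go at the
	 * end -- no search needed.
	 */
	if (ix->have_hint && ix->nkeys > 0 && ix->nkeys < CAP &&
		key > ix->keys[ix->nkeys - 1])
	{
		ix->keys[ix->nkeys++] = key;
		return;
	}

	/* Slowpath: drop the hint and do the full search for the insert spot. */
	ix->have_hint = 0;
	if (ix->nkeys >= CAP)
		return;					/* toy example: silently refuse when full */
	for (pos = 0; pos < ix->nkeys && ix->keys[pos] < key; pos++)
		;
	memmove(&ix->keys[pos + 1], &ix->keys[pos],
			(size_t) (ix->nkeys - pos) * sizeof(int));
	ix->keys[pos] = key;
	ix->nkeys++;

	/* Re-arm the hint only after a rightmost insert, as _bt_insertonpg() does. */
	if (pos == ix->nkeys - 1)
		ix->have_hint = 1;
}

int
main(void)
{
	toy_index	ix = {{0}, 0, 0};
	int			i;

	for (i = 1; i <= 10; i++)
		toy_insert(&ix, i);		/* monotonically increasing: mostly fastpath */
	toy_insert(&ix, 5);			/* out of order: falls back to the slowpath */

	for (i = 0; i < ix.nkeys; i++)
		printf("%d ", ix.keys[i]);
	printf("\n");
	return 0;
}
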
@@ -879,7 +970,16 @@ _bt_insertonpg(Relation rel,
XLogRegisterData((char *) &xlrec, SizeOfBtreeInsert);
if (P_ISLEAF(lpageop))
+ {
xlinfo = XLOG_BTREE_INSERT_LEAF;
+
+ /*
+ * Cache the block information if we just inserted into the
+ * rightmost leaf page of the index.
+ */
+ if (P_RIGHTMOST(lpageop))
+ RelationSetTargetBlock(rel, BufferGetBlockNumber(buf));
+ }
else
{
/*