-rw-r--r--   src/backend/access/nbtree/nbtinsert.c | 285
1 file changed, 160 insertions(+), 125 deletions(-)
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index 966c0bb532f..bb19c3d2637 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -29,6 +29,7 @@
 #define BTREE_FASTPATH_MIN_LEVEL	2
 
 
+static BTStack _bt_search_insert(Relation rel, BTInsertState insertstate);
 static TransactionId _bt_check_unique(Relation rel, BTInsertState insertstate,
                                       Relation heapRel,
                                       IndexUniqueCheck checkUnique, bool *is_unique,
@@ -84,9 +85,7 @@ _bt_doinsert(Relation rel, IndexTuple itup,
     bool        is_unique = false;
     BTInsertStateData insertstate;
     BTScanInsert itup_key;
-    BTStack     stack = NULL;
-    Buffer      buf;
-    bool        fastpath;
+    BTStack     stack;
     bool        checkingunique = (checkUnique != UNIQUE_CHECK_NO);
 
     /* we need an insertion scan key to do our search, so build one */
@@ -137,102 +136,32 @@ _bt_doinsert(Relation rel, IndexTuple itup,
     insertstate.buf = InvalidBuffer;
     insertstate.postingoff = 0;
 
+search:
+
     /*
-     * It's very common to have an index on an auto-incremented or
-     * monotonically increasing value. In such cases, every insertion happens
-     * towards the end of the index. We try to optimize that case by caching
-     * the right-most leaf of the index. If our cached block is still the
-     * rightmost leaf, has enough free space to accommodate a new entry and
-     * the insertion key is strictly greater than the first key in this page,
-     * then we can safely conclude that the new key will be inserted in the
-     * cached block. So we simply search within the cached block and insert
-     * the key at the appropriate location. We call it a fastpath.
-     *
-     * Testing has revealed, though, that the fastpath can result in increased
-     * contention on the exclusive-lock on the rightmost leaf page. So we
-     * conditionally check if the lock is available. If it's not available
-     * then we simply abandon the fastpath and take the regular path. This
-     * makes sense because unavailability of the lock also signals that some
-     * other backend might be concurrently inserting into the page, thus
-     * reducing our chances to finding an insertion place in this page.
+     * Find and lock the leaf page that the tuple should be added to by
+     * searching from the root page.  insertstate.buf will hold a buffer that
+     * is locked in exclusive mode afterwards.
      */
-top:
-    fastpath = false;
-    if (RelationGetTargetBlock(rel) != InvalidBlockNumber)
-    {
-        Page        page;
-        BTPageOpaque lpageop;
-
-        /*
-         * Conditionally acquire exclusive lock on the buffer before doing any
-         * checks. If we don't get the lock, we simply follow slowpath. If we
-         * do get the lock, this ensures that the index state cannot change,
-         * as far as the rightmost part of the index is concerned.
-         */
-        buf = ReadBuffer(rel, RelationGetTargetBlock(rel));
-
-        if (ConditionalLockBuffer(buf))
-        {
-            _bt_checkpage(rel, buf);
-
-            page = BufferGetPage(buf);
-
-            lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
-
-            /*
-             * Check if the page is still the rightmost leaf page, has enough
-             * free space to accommodate the new tuple, and the insertion scan
-             * key is strictly greater than the first key on the page.  Note
-             * that _bt_insert_parent() has an assertion that catches leaf
-             * page splits that somehow follow from a fastpath insert.
-             */
-            if (P_ISLEAF(lpageop) && P_RIGHTMOST(lpageop) &&
-                !P_IGNORE(lpageop) &&
-                PageGetFreeSpace(page) > insertstate.itemsz &&
-                PageGetMaxOffsetNumber(page) >= P_FIRSTDATAKEY(lpageop) &&
-                _bt_compare(rel, itup_key, page, P_FIRSTDATAKEY(lpageop)) > 0)
-            {
-                fastpath = true;
-            }
-            else
-            {
-                _bt_relbuf(rel, buf);
-
-                /*
-                 * Something did not work out. Just forget about the cached
-                 * block and follow the normal path. It might be set again if
-                 * the conditions are favourable.
-                 */
-                RelationSetTargetBlock(rel, InvalidBlockNumber);
-            }
-        }
-        else
-        {
-            ReleaseBuffer(buf);
-
-            /*
-             * If someone's holding a lock, it's likely to change anyway, so
-             * don't try again until we get an updated rightmost leaf.
-             */
-            RelationSetTargetBlock(rel, InvalidBlockNumber);
-        }
-    }
-
-    if (!fastpath)
-    {
-        /*
-         * Find the first page containing this key. Buffer returned by
-         * _bt_search() is locked in exclusive mode.
-         */
-        stack = _bt_search(rel, itup_key, &buf, BT_WRITE, NULL);
-    }
-
-    insertstate.buf = buf;
-    buf = InvalidBuffer;        /* insertstate.buf now owns the buffer */
+    stack = _bt_search_insert(rel, &insertstate);
 
     /*
-     * If we're not allowing duplicates, make sure the key isn't already in
-     * the index.
+     * checkingunique inserts are not allowed to go ahead when two tuples with
+     * equal key attribute values would be visible to new MVCC snapshots once
+     * the xact commits.  Check for conflicts in the locked page/buffer (if
+     * needed) here.
+     *
+     * It might be necessary to check a page to the right in _bt_check_unique,
+     * though that should be very rare.  In practice the first page the value
+     * could be on (with scantid omitted) is almost always also the only page
+     * that a matching tuple might be found on.  This is due to the behavior
+     * of _bt_findsplitloc with duplicate tuples -- a group of duplicates can
+     * only be allowed to cross a page boundary when there is no candidate
+     * leaf page split point that avoids it.  Also, _bt_check_unique can use
+     * the leaf page high key to determine that there will be no duplicates on
+     * the right sibling without actually visiting it (it uses the high key in
+     * cases where the new item happens to belong at the far right of the leaf
+     * page).
      *
      * NOTE: obviously, _bt_check_unique can only detect keys that are already
     * in the index; so it cannot defend against concurrent insertions of the
@@ -246,7 +175,7 @@ top:
      * insertion.  (This requires some care in _bt_findinsertloc.)
      *
      * If we must wait for another xact, we release the lock while waiting,
-     * and then must start over completely.
+     * and then must perform a new search.
      *
      * For a partial uniqueness check, we don't wait for the other xact.  Just
     * let the tuple in and return false for possibly non-unique, or true for
@@ -260,7 +189,7 @@ top:
         xwait = _bt_check_unique(rel, &insertstate, heapRel, checkUnique,
                                  &is_unique, &speculativeToken);
 
-        if (TransactionIdIsValid(xwait))
+        if (unlikely(TransactionIdIsValid(xwait)))
         {
             /* Have to wait for the other guy ... */
             _bt_relbuf(rel, insertstate.buf);
@@ -279,7 +208,7 @@ top:
             /* start over... */
             if (stack)
                 _bt_freestack(stack);
-            goto top;
+            goto search;
         }
 
         /* Uniqueness is established -- restore heap tid as scantid */
@@ -326,6 +255,112 @@ top:
 }
 
 /*
+ * _bt_search_insert() -- _bt_search() wrapper for inserts
+ *
+ * Search the tree for a particular scankey, or more precisely for the first
+ * leaf page it could be on.  Try to make use of the fastpath optimization's
+ * rightmost leaf page cache before actually searching the tree from the root
+ * page, though.
+ *
+ * Return value is a stack of parent-page pointers (though see notes about
+ * fastpath optimization and page splits below).  insertstate->buf is set to
+ * the address of the leaf-page buffer, which is write-locked and pinned in
+ * all cases (if necessary by creating a new empty root page for caller).
+ *
+ * The fastpath optimization avoids most of the work of searching the tree
+ * repeatedly when a single backend inserts successive new tuples on the
+ * rightmost leaf page of an index.  A backend cache of the rightmost leaf
+ * page is maintained within _bt_insertonpg(), and used here.  The cache is
+ * invalidated here when an insert of a non-pivot tuple must take place on a
+ * non-rightmost leaf page.
+ *
+ * The optimization helps with indexes on an auto-incremented field.  It also
+ * helps with indexes on datetime columns, as well as indexes with lots of
+ * NULL values.  (NULLs usually get inserted in the rightmost page for single
+ * column indexes, since they usually get treated as coming after everything
+ * else in the key space.  Individual NULL tuples will generally be placed on
+ * the rightmost leaf page due to the influence of the heap TID column.)
+ *
+ * Note that we avoid applying the optimization when there is insufficient
+ * space on the rightmost page to fit caller's new item.  This is necessary
+ * because we'll need to return a real descent stack when a page split is
+ * expected (actually, caller can cope with a leaf page split that uses a NULL
+ * stack, but that's very slow and so must be avoided).  Note also that the
+ * fastpath optimization acquires the lock on the page conditionally as a way
+ * of reducing extra contention when there are concurrent insertions into the
+ * rightmost page (we give up if we'd have to wait for the lock).  We assume
+ * that it isn't useful to apply the optimization when there is contention,
+ * since each per-backend cache won't stay valid for long.
+ */
+static BTStack
+_bt_search_insert(Relation rel, BTInsertState insertstate)
+{
+    Assert(insertstate->buf == InvalidBuffer);
+    Assert(!insertstate->bounds_valid);
+    Assert(insertstate->postingoff == 0);
+
+    if (RelationGetTargetBlock(rel) != InvalidBlockNumber)
+    {
+        /* Simulate a _bt_getbuf() call with conditional locking */
+        insertstate->buf = ReadBuffer(rel, RelationGetTargetBlock(rel));
+        if (ConditionalLockBuffer(insertstate->buf))
+        {
+            Page        page;
+            BTPageOpaque lpageop;
+
+            _bt_checkpage(rel, insertstate->buf);
+            page = BufferGetPage(insertstate->buf);
+            lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
+
+            /*
+             * Check if the page is still the rightmost leaf page and has
+             * enough free space to accommodate the new tuple.  Also check
+             * that the insertion scan key is strictly greater than the first
+             * non-pivot tuple on the page.  (Note that we expect itup_key's
+             * scantid to be unset when our caller is a checkingunique
+             * inserter.)
+             */
+            if (P_RIGHTMOST(lpageop) &&
+                P_ISLEAF(lpageop) &&
+                !P_IGNORE(lpageop) &&
+                PageGetFreeSpace(page) > insertstate->itemsz &&
+                PageGetMaxOffsetNumber(page) >= P_HIKEY &&
+                _bt_compare(rel, insertstate->itup_key, page, P_HIKEY) > 0)
+            {
+                /*
+                 * Caller can use the fastpath optimization because cached
+                 * block is still rightmost leaf page, which can fit caller's
+                 * new tuple without splitting.  Keep block in local cache for
+                 * next insert, and have caller use NULL stack.
+                 *
+                 * Note that _bt_insert_parent() has an assertion that catches
+                 * leaf page splits that somehow follow from a fastpath insert
+                 * (it should only be passed a NULL stack when it must deal
+                 * with a concurrent root page split, and never because a NULL
+                 * stack was returned here).
+                 */
+                return NULL;
+            }
+
+            /* Page unsuitable for caller, drop lock and pin */
+            _bt_relbuf(rel, insertstate->buf);
+        }
+        else
+        {
+            /* Lock unavailable, drop pin */
+            ReleaseBuffer(insertstate->buf);
+        }
+
+        /* Forget block, since cache doesn't appear to be useful */
+        RelationSetTargetBlock(rel, InvalidBlockNumber);
+    }
+
+    /* Cannot use optimization -- descend tree, return proper descent stack */
+    return _bt_search(rel, insertstate->itup_key, &insertstate->buf, BT_WRITE,
+                      NULL);
+}
+
+/*
  * _bt_check_unique() -- Check for violation of unique index constraint
  *
  * Returns InvalidTransactionId if there is no conflict, else an xact ID
@@ -1177,10 +1212,12 @@ _bt_insertonpg(Relation rel,
     }
     else
     {
+        bool        isleaf = P_ISLEAF(lpageop);
+        bool        isrightmost = P_RIGHTMOST(lpageop);
         Buffer      metabuf = InvalidBuffer;
         Page        metapg = NULL;
         BTMetaPageData *metad = NULL;
-        BlockNumber cachedBlock = InvalidBlockNumber;
+        BlockNumber blockcache;
 
         /*
          * If we are doing this insert because we split a page that was the
@@ -1191,7 +1228,8 @@ _bt_insertonpg(Relation rel,
          */
         if (split_only_page)
         {
-            Assert(!P_ISLEAF(lpageop));
+            Assert(!isleaf);
+            Assert(BufferIsValid(cbuf));
 
             metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
             metapg = BufferGetPage(metabuf);
@@ -1238,15 +1276,6 @@ _bt_insertonpg(Relation rel,
             MarkBufferDirty(cbuf);
         }
 
-        /*
-         * Cache the block information if we just inserted into the rightmost
-         * leaf page of the index and it's not the root page.  For very small
-         * index where root is also the leaf, there is no point trying for any
-         * optimization.
-         */
-        if (P_RIGHTMOST(lpageop) && P_ISLEAF(lpageop) && !P_ISROOT(lpageop))
-            cachedBlock = BufferGetBlockNumber(buf);
-
         /* XLOG stuff */
         if (RelationNeedsWAL(rel))
         {
@@ -1260,7 +1289,7 @@ _bt_insertonpg(Relation rel,
             XLogBeginInsert();
             XLogRegisterData((char *) &xlrec, SizeOfBtreeInsert);
 
-            if (P_ISLEAF(lpageop) && postingoff == 0)
+            if (isleaf && postingoff == 0)
             {
                 /* Simple leaf insert */
                 xlinfo = XLOG_BTREE_INSERT_LEAF;
@@ -1329,36 +1358,42 @@ _bt_insertonpg(Relation rel,
             recptr = XLogInsert(RM_BTREE_ID, xlinfo);
 
             if (BufferIsValid(metabuf))
-            {
                 PageSetLSN(metapg, recptr);
-            }
             if (BufferIsValid(cbuf))
-            {
                 PageSetLSN(BufferGetPage(cbuf), recptr);
-            }
 
             PageSetLSN(page, recptr);
         }
 
         END_CRIT_SECTION();
 
-        /* release buffers */
+        /* Release subsidiary buffers */
         if (BufferIsValid(metabuf))
             _bt_relbuf(rel, metabuf);
         if (BufferIsValid(cbuf))
            _bt_relbuf(rel, cbuf);
+
+        /*
+         * Cache the block number if this is the rightmost leaf page.  Cache
+         * may be used by a future inserter within _bt_search_insert().
+         */
+        blockcache = InvalidBlockNumber;
+        if (isrightmost && isleaf && !P_ISROOT(lpageop))
+            blockcache = BufferGetBlockNumber(buf);
+
+        /* Release buffer for insertion target block */
         _bt_relbuf(rel, buf);
 
         /*
-         * If we decided to cache the insertion target block, then set it now.
-         * But before that, check for the height of the tree and don't go for
-         * the optimization for small indexes. We defer that check to this
-         * point to ensure that we don't call _bt_getrootheight while holding
-         * lock on any other block.
+         * If we decided to cache the insertion target block before releasing
+         * its buffer lock, then cache it now.  Check the height of the tree
+         * first, though.  We don't go for the optimization with small
+         * indexes.  Defer final check to this point to ensure that we don't
+         * call _bt_getrootheight while holding a buffer lock.
          */
-        if (BlockNumberIsValid(cachedBlock) &&
+        if (BlockNumberIsValid(blockcache) &&
             _bt_getrootheight(rel) >= BTREE_FASTPATH_MIN_LEVEL)
-            RelationSetTargetBlock(rel, cachedBlock);
+            RelationSetTargetBlock(rel, blockcache);
     }
 
     /* be tidy */
@@ -2054,9 +2089,9 @@ _bt_insert_parent(Relation rel,
              * This is more of a performance issue than a correctness issue.
              * The fastpath won't have a descent stack.  Using a phony stack
              * here works, but never rely on that.  The fastpath should be
-             * rejected when the rightmost leaf page will split, since it's
-             * faster to go through _bt_search() and get a stack in the usual
-             * way.
+             * rejected within _bt_search_insert() when the rightmost leaf
+             * page will split, since it's faster to go through _bt_search()
+             * and get a stack in the usual way.
              */
             Assert(!(P_ISLEAF(lpageop) &&
                      BlockNumberIsValid(RelationGetTargetBlock(rel))));
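
The checks that _bt_search_insert() keeps from the old fastpath are easy to model outside the server. Below is a minimal single-process sketch, not PostgreSQL code: toy_index, toy_page and cache_usable are invented names, an int stands in for an index tuple, and buffer locking is ignored. It shows the three conditions the patch preserves -- the cached block must still be the rightmost leaf, must have room for the new item, and the new key must sort strictly after the first item already on the page -- and how any miss forgets the cached block, mirroring RelationSetTargetBlock(rel, InvalidBlockNumber).

    #include <stdbool.h>
    #include <stdio.h>

    #define PAGE_CAPACITY   4
    #define INVALID_BLOCK   (-1)

    typedef struct toy_page
    {
        int         nitems;
        int         items[PAGE_CAPACITY];
    } toy_page;

    typedef struct toy_index
    {
        int         npages;
        toy_page    pages[8];
        int         target_block;   /* cached rightmost leaf, or INVALID_BLOCK */
    } toy_index;

    /*
     * Decide whether the cached rightmost page can take "key" without a full
     * descent: still the rightmost page, room for one more item, and key
     * strictly greater than the first item already on it.  On any miss the
     * cache is forgotten, as the real code does with RelationSetTargetBlock().
     */
    static bool
    cache_usable(toy_index *idx, int key)
    {
        toy_page   *page;

        if (idx->target_block == INVALID_BLOCK)
            return false;

        page = &idx->pages[idx->target_block];
        if (idx->target_block == idx->npages - 1 &&
            page->nitems > 0 &&
            page->nitems < PAGE_CAPACITY &&
            key > page->items[0])
            return true;

        /* Forget block, since cache doesn't appear to be useful */
        idx->target_block = INVALID_BLOCK;
        return false;
    }

    int
    main(void)
    {
        toy_index   idx = {
            .npages = 2,
            .pages = {{.nitems = 0}, {.nitems = 2, .items = {10, 20}}},
            .target_block = 1,
        };

        printf("key 30 -> fastpath? %s\n", cache_usable(&idx, 30) ? "yes" : "no");
        printf("key  5 -> fastpath? %s\n", cache_usable(&idx, 5) ? "yes" : "no");
        printf("cached block after miss: %d\n", idx.target_block);
        return 0;
    }

Built with cc -std=c99, it reports a hit for key 30, a miss for key 5, and -1 for the cached block afterwards; at that point the real code falls back to a full _bt_search() descent.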
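The reason the fastpath still takes the lock conditionally (ConditionalLockBuffer()) rather than waiting for it can be shown with a plain trylock. The pthreads sketch below is only an analogy -- rightmost_page_lock and try_fastpath are made-up names and a mutex is a loose stand-in for a buffer content lock: when the trylock fails, the caller gives up on the shortcut instead of queuing behind the other inserter.

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Stand-in for the rightmost leaf page's content lock */
    static pthread_mutex_t rightmost_page_lock = PTHREAD_MUTEX_INITIALIZER;

    /*
     * Attempt the shortcut.  pthread_mutex_trylock() plays the role of
     * ConditionalLockBuffer(): on failure we do not wait, we just tell the
     * caller to fall back to a full tree descent.
     */
    static bool
    try_fastpath(void)
    {
        if (pthread_mutex_trylock(&rightmost_page_lock) != 0)
            return false;           /* contended: abandon the fastpath */

        /* ... validate the cached page and insert the new item here ... */
        pthread_mutex_unlock(&rightmost_page_lock);
        return true;
    }

    static void *
    worker(void *arg)
    {
        int         hits = 0;

        for (int i = 0; i < 100000; i++)
        {
            if (try_fastpath())
                hits++;
            /* else: a real backend would descend from the root instead */
        }
        printf("worker %d: %d of 100000 attempts took the fastpath\n",
               (int) (intptr_t) arg, hits);
        return NULL;
    }

    int
    main(void)
    {
        pthread_t   threads[2];

        for (intptr_t i = 0; i < 2; i++)
            pthread_create(&threads[i], NULL, worker, (void *) i);
        for (int i = 0; i < 2; i++)
            pthread_join(threads[i], NULL);
        return 0;
    }

Compile with cc -pthread; neither worker ever sleeps on the mutex, which is the point of the design -- contention turns into ordinary root-to-leaf searches rather than lock waits on the rightmost page.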
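The goto search retry in _bt_doinsert() follows the usual shape for unique checks: when _bt_check_unique() reports a conflicting transaction that is still in progress, release the page lock, wait for that transaction to finish, free the descent stack, and search again from scratch. A self-contained sketch of that control flow, with invented helpers (toy_leaf, check_unique, wait_for_xact) and none of the real locking or MVCC machinery:

    #include <stdbool.h>
    #include <stdio.h>

    /* A "leaf page" reduced to a small array of keys */
    typedef struct toy_leaf
    {
        int         nkeys;
        int         keys[16];
    } toy_leaf;

    static toy_leaf leaf;
    static int  pending_xact = 42;  /* pretend another xact inserted key 7 but
                                     * has not committed or aborted yet */

    /* 0 = no conflict, -1 = committed duplicate, else the conflicting xact id */
    static int
    check_unique(toy_leaf *page, int key)
    {
        if (key == 7 && pending_xact != 0)
            return pending_xact;
        for (int i = 0; i < page->nkeys; i++)
        {
            if (page->keys[i] == key)
                return -1;
        }
        return 0;
    }

    static void
    wait_for_xact(int xid)
    {
        printf("waiting for xact %d, then searching again\n", xid);
        pending_xact = 0;           /* the other transaction "aborts" */
    }

    static bool
    insert_unique(int key)
    {
        int         xwait;

    search:
        /* a real insert would descend the tree and lock a leaf page here */
        xwait = check_unique(&leaf, key);
        if (xwait > 0)
        {
            /* release the (unmodeled) page lock, wait, then start over */
            wait_for_xact(xwait);
            goto search;
        }
        if (xwait < 0)
            return false;           /* uniqueness violation */

        leaf.keys[leaf.nkeys++] = key;
        return true;
    }

    int
    main(void)
    {
        printf("insert 7: %s\n", insert_unique(7) ? "ok" : "duplicate");
        printf("insert 7: %s\n", insert_unique(7) ? "ok" : "duplicate");
        return 0;
    }

The first insert of key 7 waits once and then succeeds; the second finds a committed duplicate and fails, which corresponds to the case where _bt_check_unique() raises a unique-violation error.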