Diffstat (limited to 'src/backend/access/nbtree/nbtinsert.c')
-rw-r--r--  src/backend/access/nbtree/nbtinsert.c  407
1 file changed, 262 insertions(+), 145 deletions(-)
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index 1facd0535d8..b2ee0adb502 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -61,14 +61,16 @@ static OffsetNumber _bt_findinsertloc(Relation rel,
BTStack stack,
Relation heapRel);
static void _bt_stepright(Relation rel, BTInsertState insertstate, BTStack stack);
-static void _bt_insertonpg(Relation rel, Buffer buf, Buffer cbuf,
+static void _bt_insertonpg(Relation rel, BTScanInsert itup_key,
+ Buffer buf,
+ Buffer cbuf,
BTStack stack,
IndexTuple itup,
OffsetNumber newitemoff,
bool split_only_page);
-static Buffer _bt_split(Relation rel, Buffer buf, Buffer cbuf,
- OffsetNumber firstright, OffsetNumber newitemoff, Size newitemsz,
- IndexTuple newitem, bool newitemonleft);
+static Buffer _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf,
+ Buffer cbuf, OffsetNumber firstright, OffsetNumber newitemoff,
+ Size newitemsz, IndexTuple newitem, bool newitemonleft);
static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf,
BTStack stack, bool is_root, bool is_only);
static OffsetNumber _bt_findsplitloc(Relation rel, Page page,
@@ -116,6 +118,9 @@ _bt_doinsert(Relation rel, IndexTuple itup,
/* we need an insertion scan key to do our search, so build one */
itup_key = _bt_mkscankey(rel, itup);
+ /* No scantid until uniqueness established in checkingunique case */
+ if (checkingunique && itup_key->heapkeyspace)
+ itup_key->scantid = NULL;
/*
* Fill in the BTInsertState working area, to track the current page and
@@ -231,12 +236,13 @@ top:
* NOTE: obviously, _bt_check_unique can only detect keys that are already
* in the index; so it cannot defend against concurrent insertions of the
* same key. We protect against that by means of holding a write lock on
- * the target page. Any other would-be inserter of the same key must
- * acquire a write lock on the same target page, so only one would-be
- * inserter can be making the check at one time. Furthermore, once we are
- * past the check we hold write locks continuously until we have performed
- * our insertion, so no later inserter can fail to see our insertion.
- * (This requires some care in _bt_findinsertloc.)
+ * the first page the value could be on, regardless of the value of its
+ * implicit heap TID tiebreaker attribute. Any other would-be inserter of
+ * the same key must acquire a write lock on the same page, so only one
+ * would-be inserter can be making the check at one time. Furthermore,
+ * once we are past the check we hold write locks continuously until we
+ * have performed our insertion, so no later inserter can fail to see our
+ * insertion. (This requires some care in _bt_findinsertloc.)
*
* If we must wait for another xact, we release the lock while waiting,
* and then must start over completely.
@@ -274,6 +280,10 @@ top:
_bt_freestack(stack);
goto top;
}
+
+ /* Uniqueness is established -- restore heap tid as scantid */
+ if (itup_key->heapkeyspace)
+ itup_key->scantid = &itup->t_tid;
}
if (checkUnique != UNIQUE_CHECK_EXISTING)
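The clear-then-restore dance above is the heart of the change: with scantid left NULL, all duplicates of a key compare as equal, so the descent settles on the first leaf page a duplicate could be on; once _bt_check_unique() is done, the heap TID tiebreaker is restored and the insertion scan key identifies exactly one spot in the keyspace. Below is a minimal standalone sketch of that comparison rule, with made-up InsertKey/IndexTupleModel types rather than the real nbtree structs:

#include <stddef.h>

typedef struct
{
    int         key;        /* user-visible key attribute */
    const int  *scantid;    /* heap TID tiebreaker, or NULL */
} InsertKey;

typedef struct
{
    int         key;
    int         tid;
} IndexTupleModel;

/* Loosely mimics _bt_compare(): a NULL scantid ends the comparison early */
static int
compare_insert_key(const InsertKey *ikey, const IndexTupleModel *tup)
{
    if (ikey->key != tup->key)
        return ikey->key < tup->key ? -1 : 1;
    if (ikey->scantid == NULL)
        return 0;               /* all duplicates look equal */
    if (*ikey->scantid != tup->tid)
        return *ikey->scantid < tup->tid ? -1 : 1;
    return 0;
}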
@@ -282,12 +292,11 @@ top:
/*
* The only conflict predicate locking cares about for indexes is when
- * an index tuple insert conflicts with an existing lock. Since the
- * actual location of the insert is hard to predict because of the
- * random search used to prevent O(N^2) performance when there are
- * many duplicate entries, we can just use the "first valid" page.
- * This reasoning also applies to INCLUDE indexes, whose extra
- * attributes are not considered part of the key space.
+ * an index tuple insert conflicts with an existing lock. We don't
+ * yet know the actual page we will insert on, because scantid was
+ * not filled in initially, but it's okay to use the "first valid"
+ * page instead. This reasoning also applies to INCLUDE indexes,
+ * whose extra attributes are not considered part of the key space.
*/
CheckForSerializableConflictIn(rel, NULL, insertstate.buf);
@@ -298,8 +307,8 @@ top:
*/
newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique,
stack, heapRel);
- _bt_insertonpg(rel, insertstate.buf, InvalidBuffer, stack, itup,
- newitemoff, false);
+ _bt_insertonpg(rel, itup_key, insertstate.buf, InvalidBuffer, stack,
+ itup, newitemoff, false);
}
else
{
@@ -371,6 +380,7 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
* Scan over all equal tuples, looking for live conflicts.
*/
Assert(!insertstate->bounds_valid || insertstate->low == offset);
+ Assert(itup_key->scantid == NULL);
for (;;)
{
ItemId curitemid;
@@ -642,18 +652,21 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
/*
* _bt_findinsertloc() -- Finds an insert location for a tuple
*
- * On entry, insertstate buffer contains the first legal page the new
- * tuple could be inserted to. It is exclusive-locked and pinned by the
- * caller.
+ * On entry, insertstate buffer contains the page the new tuple belongs
+ * on. It is exclusive-locked and pinned by the caller.
+ *
+ * If 'checkingunique' is true, the buffer on entry is the first page
+ * that contains duplicates of the new key. If there are duplicates on
+ * multiple pages, the correct insertion position might be some page to
+ * the right, rather than the first page. In that case, this function
+ * moves right to the correct target page.
*
- * If the new key is equal to one or more existing keys, we can
- * legitimately place it anywhere in the series of equal keys --- in fact,
- * if the new key is equal to the page's "high key" we can place it on
- * the next page. If it is equal to the high key, and there's not room
- * to insert the new tuple on the current page without splitting, then
- * we can move right hoping to find more free space and avoid a split.
- * Furthermore, if there's not enough room on a page, we try to make
- * room by removing any LP_DEAD tuples.
+ * (In a !heapkeyspace index, there can be multiple pages with the same
+ * high key, on which the new tuple could legitimately be placed. In
+ * that case, the caller passes the first page containing duplicates,
+ * just like when checkingunique=true. If that page doesn't have enough
+ * room for the new tuple, this function moves right, trying to find a
+ * legal page that does.)
*
* On exit, insertstate buffer contains the chosen insertion page, and
* the offset within that page is returned. If _bt_findinsertloc needed
@@ -663,6 +676,9 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
* If insertstate contains cached binary search bounds, we will take
* advantage of them. This avoids repeating comparisons that we made in
* _bt_check_unique() already.
+ *
+ * If there is not enough room on the page for the new tuple, we try to
+ * make room by removing any LP_DEAD tuples.
*/
static OffsetNumber
_bt_findinsertloc(Relation rel,
@@ -677,87 +693,144 @@ _bt_findinsertloc(Relation rel,
lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
- /*
- * Check whether the item can fit on a btree page at all. (Eventually, we
- * ought to try to apply TOAST methods if not.) We actually need to be
- * able to fit three items on every page, so restrict any one item to 1/3
- * the per-page available space. Note that at this point, itemsz doesn't
- * include the ItemId.
- *
- * NOTE: if you change this, see also the similar code in _bt_buildadd().
- */
- if (insertstate->itemsz > BTMaxItemSize(page))
- ereport(ERROR,
- (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
- errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
- insertstate->itemsz, BTMaxItemSize(page),
- RelationGetRelationName(rel)),
- errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
- "Consider a function index of an MD5 hash of the value, "
- "or use full text indexing."),
- errtableconstraint(heapRel,
- RelationGetRelationName(rel))));
+ /* Check 1/3 of a page restriction */
+ if (unlikely(insertstate->itemsz > BTMaxItemSize(page)))
+ _bt_check_third_page(rel, heapRel, itup_key->heapkeyspace, page,
+ insertstate->itup);
- /*----------
- * If we will need to split the page to put the item on this page,
- * check whether we can put the tuple somewhere to the right,
- * instead. Keep scanning right until we
- * (a) find a page with enough free space,
- * (b) reach the last page where the tuple can legally go, or
- * (c) get tired of searching.
- * (c) is not flippant; it is important because if there are many
- * pages' worth of equal keys, it's better to split one of the early
- * pages than to scan all the way to the end of the run of equal keys
- * on every insert. We implement "get tired" as a random choice,
- * since stopping after scanning a fixed number of pages wouldn't work
- * well (we'd never reach the right-hand side of previously split
- * pages). Currently the probability of moving right is set at 0.99,
- * which may seem too high to change the behavior much, but it does an
- * excellent job of preventing O(N^2) behavior with many equal keys.
- *----------
- */
Assert(P_ISLEAF(lpageop) && !P_INCOMPLETE_SPLIT(lpageop));
Assert(!insertstate->bounds_valid || checkingunique);
+ Assert(!itup_key->heapkeyspace || itup_key->scantid != NULL);
+ Assert(itup_key->heapkeyspace || itup_key->scantid == NULL);
- while (PageGetFreeSpace(page) < insertstate->itemsz)
+ if (itup_key->heapkeyspace)
{
/*
- * before considering moving right, see if we can obtain enough space
- * by erasing LP_DEAD items
+ * If we're inserting into a unique index, we may have to walk right
+ * through leaf pages to find the one leaf page that we must insert
+ * onto.
+ *
+ * This is needed for checkingunique callers because scantid was not
+ * set when we called _bt_search(). scantid can only be set after
+ * _bt_check_unique() has checked for duplicates. The buffer
+ * initially stored in insertstate->buf has the page where the first
+ * duplicate key might be found, which isn't always the page that the
+ * new tuple belongs on. The heap TID attribute of the new tuple
+ * (scantid) could force us to insert on a sibling page, though that
+ * should be very rare in practice.
*/
- if (P_HAS_GARBAGE(lpageop))
+ if (checkingunique)
{
- _bt_vacuum_one_page(rel, insertstate->buf, heapRel);
- insertstate->bounds_valid = false;
+ for (;;)
+ {
+ /*
+ * Does the new tuple belong on this page?
+ *
+ * The earlier _bt_check_unique() call may well have
+ * established a strict upper bound on the offset for the new
+ * item. If it's not the last item of the page (i.e. if there
+ * is at least one tuple on the page that goes after the tuple
+ * we're inserting) then we know that the tuple belongs on
+ * this page. We can skip the high key check.
+ */
+ if (insertstate->bounds_valid &&
+ insertstate->low <= insertstate->stricthigh &&
+ insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
+ break;
+
+ /* Test '<=', not '!=', since scantid is set now */
+ if (P_RIGHTMOST(lpageop) ||
+ _bt_compare(rel, itup_key, page, P_HIKEY) <= 0)
+ break;
- if (PageGetFreeSpace(page) >= insertstate->itemsz)
- break; /* OK, now we have enough space */
+ _bt_stepright(rel, insertstate, stack);
+ /* Update local state after stepping right */
+ page = BufferGetPage(insertstate->buf);
+ lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
+ }
}
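Under the same toy model, the checkingunique move-right loop above reduces to: advance while the page is not rightmost and its high key is strictly less than the full key, scantid included. A sketch reusing compare_insert_key() and the types from the earlier example, with a flat array of high keys standing in for the leaf level (the real code, of course, locks sibling buffers and steps via _bt_stepright()):

/* Returns the index of the one page the new tuple must go on */
static int
find_insert_page(const InsertKey *ikey, const IndexTupleModel *highkeys,
                 int npages)
{
    int         pageno = 0;

    /* test '<=', not '!=', since scantid is set now */
    while (pageno < npages - 1 &&   /* rightmost page has no high key */
           compare_insert_key(ikey, &highkeys[pageno]) > 0)
        pageno++;                   /* stands in for _bt_stepright() */
    return pageno;
}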
/*
- * Nope, so check conditions (b) and (c) enumerated above
+ * If the target page is full, see if we can obtain enough space by
+ * erasing LP_DEAD items
+ */
+ if (PageGetFreeSpace(page) < insertstate->itemsz &&
+ P_HAS_GARBAGE(lpageop))
+ {
+ _bt_vacuum_one_page(rel, insertstate->buf, heapRel);
+ insertstate->bounds_valid = false;
+ }
+ }
+ else
+ {
+ /*----------
+ * This is a !heapkeyspace (version 2 or 3) index. The current page
+ * is the first page the new tuple could be inserted on, but there
+ * may be other pages to the right that we could opt to use instead.
*
- * The earlier _bt_check_unique() call may well have established a
- * strict upper bound on the offset for the new item. If it's not the
- * last item of the page (i.e. if there is at least one tuple on the
- * page that's greater than the tuple we're inserting to) then we know
- * that the tuple belongs on this page. We can skip the high key
- * check.
+ * If the new key is equal to one or more existing keys, we can
+ * legitimately place it anywhere in the series of equal keys. In
+ * fact, if the new key is equal to the page's "high key" we can place
+ * it on the next page. If it is equal to the high key, and there's
+ * not room to insert the new tuple on the current page without
+ * splitting, then we move right hoping to find more free space and
+ * avoid a split.
+ *
+ * Keep scanning right until we
+ * (a) find a page with enough free space,
+ * (b) reach the last page where the tuple can legally go, or
+ * (c) get tired of searching.
+ * (c) is not flippant; it is important because if there are many
+ * pages' worth of equal keys, it's better to split one of the early
+ * pages than to scan all the way to the end of the run of equal keys
+ * on every insert. We implement "get tired" as a random choice,
+ * since stopping after scanning a fixed number of pages wouldn't work
+ * well (we'd never reach the right-hand side of previously split
+ * pages). The probability of moving right is set at 0.99, which may
+ * seem too high to change the behavior much, but it does an excellent
+ * job of preventing O(N^2) behavior with many equal keys.
+ *----------
*/
- if (insertstate->bounds_valid &&
- insertstate->low <= insertstate->stricthigh &&
- insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
- break;
+ while (PageGetFreeSpace(page) < insertstate->itemsz)
+ {
+ /*
+ * Before considering moving right, see if we can obtain enough
+ * space by erasing LP_DEAD items
+ */
+ if (P_HAS_GARBAGE(lpageop))
+ {
+ _bt_vacuum_one_page(rel, insertstate->buf, heapRel);
+ insertstate->bounds_valid = false;
- if (P_RIGHTMOST(lpageop) ||
- _bt_compare(rel, itup_key, page, P_HIKEY) != 0 ||
- random() <= (MAX_RANDOM_VALUE / 100))
- break;
+ if (PageGetFreeSpace(page) >= insertstate->itemsz)
+ break; /* OK, now we have enough space */
+ }
- _bt_stepright(rel, insertstate, stack);
- /* Update local state after stepping right */
- page = BufferGetPage(insertstate->buf);
- lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
+ /*
+ * Nope, so check conditions (b) and (c) enumerated above
+ *
+ * The earlier _bt_check_unique() call may well have established a
+ * strict upper bound on the offset for the new item. If it's not
+ * the last item of the page (i.e. if there is at least one tuple
+ * on the page that's greater than the tuple we're inserting)
+ * then we know that the tuple belongs on this page. We can skip
+ * the high key check.
+ */
+ if (insertstate->bounds_valid &&
+ insertstate->low <= insertstate->stricthigh &&
+ insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
+ break;
+
+ if (P_RIGHTMOST(lpageop) ||
+ _bt_compare(rel, itup_key, page, P_HIKEY) != 0 ||
+ random() <= (MAX_RANDOM_VALUE / 100))
+ break;
+
+ _bt_stepright(rel, insertstate, stack);
+ /* Update local state after stepping right */
+ page = BufferGetPage(insertstate->buf);
+ lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
+ }
}
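The "get tired" rule retained for !heapkeyspace indexes is worth isolating: each step right continues with probability 0.99, so the chance of walking N pages deep into a run of duplicates decays as 0.99^N, and early pages in the run get split instead. A hedged one-function sketch (MAX_RANDOM_VALUE is assumed here to be 0x7FFFFFFF, matching PostgreSQL's definition):

#include <stdlib.h>

#define MAX_RANDOM_VALUE 0x7FFFFFFF /* assumed value */

/* Model of condition (c): give up moving right ~1% of the time */
static int
get_tired(void)
{
    return random() <= (MAX_RANDOM_VALUE / 100);
}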
/*
@@ -778,6 +851,9 @@ _bt_findinsertloc(Relation rel,
* else someone else's _bt_check_unique scan could fail to see our insertion.
* Write locks on intermediate dead pages won't do because we don't know when
* they will get de-linked from the tree.
+ *
+ * This is more aggressive than it needs to be for non-unique !heapkeyspace
+ * indexes.
*/
static void
_bt_stepright(Relation rel, BTInsertState insertstate, BTStack stack)
@@ -830,8 +906,9 @@ _bt_stepright(Relation rel, BTInsertState insertstate, BTStack stack)
*
* This recursive procedure does the following things:
*
- * + if necessary, splits the target page (making sure that the
- * split is equitable as far as post-insert free space goes).
+ * + if necessary, splits the target page, using 'itup_key' for
+ * suffix truncation on leaf pages (caller passes NULL for
+ * non-leaf pages).
* + inserts the tuple.
* + if the page was split, pops the parent stack, and finds the
* right place to insert the new child pointer (by walking
@@ -857,6 +934,7 @@ _bt_stepright(Relation rel, BTInsertState insertstate, BTStack stack)
*/
static void
_bt_insertonpg(Relation rel,
+ BTScanInsert itup_key,
Buffer buf,
Buffer cbuf,
BTStack stack,
@@ -879,7 +957,7 @@ _bt_insertonpg(Relation rel,
BTreeTupleGetNAtts(itup, rel) ==
IndexRelationGetNumberOfAttributes(rel));
Assert(P_ISLEAF(lpageop) ||
- BTreeTupleGetNAtts(itup, rel) ==
+ BTreeTupleGetNAtts(itup, rel) <=
IndexRelationGetNumberOfKeyAttributes(rel));
/* The caller should've finished any incomplete splits already. */
@@ -929,8 +1007,8 @@ _bt_insertonpg(Relation rel,
&newitemonleft);
/* split the buffer into left and right halves */
- rbuf = _bt_split(rel, buf, cbuf, firstright,
- newitemoff, itemsz, itup, newitemonleft);
+ rbuf = _bt_split(rel, itup_key, buf, cbuf, firstright, newitemoff,
+ itemsz, itup, newitemonleft);
PredicateLockPageSplit(rel,
BufferGetBlockNumber(buf),
BufferGetBlockNumber(rbuf));
@@ -1014,7 +1092,7 @@ _bt_insertonpg(Relation rel,
if (BufferIsValid(metabuf))
{
/* upgrade meta-page if needed */
- if (metad->btm_version < BTREE_VERSION)
+ if (metad->btm_version < BTREE_NOVAC_VERSION)
_bt_upgrademetapage(metapg);
metad->btm_fastroot = itup_blkno;
metad->btm_fastlevel = lpageop->btpo.level;
@@ -1069,6 +1147,8 @@ _bt_insertonpg(Relation rel,
if (BufferIsValid(metabuf))
{
+ Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
+ xlmeta.version = metad->btm_version;
xlmeta.root = metad->btm_root;
xlmeta.level = metad->btm_level;
xlmeta.fastroot = metad->btm_fastroot;
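Both metapage-touching paths follow one pattern after this change: a pre-BTREE_NOVAC_VERSION metapage is upgraded in place only when it must be rewritten anyway, and the version is now carried in the WAL record so redo reproduces it exactly. A compressed sketch with invented stand-ins for the real metapage and WAL structs:

#define BTREE_NOVAC_VERSION_MODEL 3 /* invented stand-in constant */

typedef struct
{
    unsigned    btm_version;
    /* ... remaining metapage fields elided ... */
} MetaModel;

typedef struct
{
    unsigned    version;        /* newly logged alongside root info */
} XlMetaModel;

static void
prepare_meta_record(MetaModel *metad, XlMetaModel *xlmeta)
{
    /* lazy upgrade: only when the metapage is being dirtied anyway */
    if (metad->btm_version < BTREE_NOVAC_VERSION_MODEL)
        metad->btm_version = BTREE_NOVAC_VERSION_MODEL;
    xlmeta->version = metad->btm_version;
}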
@@ -1136,17 +1216,19 @@ _bt_insertonpg(Relation rel,
* new right page. newitemoff etc. tell us about the new item that
* must be inserted along with the data from the old page.
*
- * When splitting a non-leaf page, 'cbuf' is the left-sibling of the
- * page we're inserting the downlink for. This function will clear the
- * INCOMPLETE_SPLIT flag on it, and release the buffer.
+ * itup_key is used for suffix truncation on leaf pages (internal
+ * page callers pass NULL). When splitting a non-leaf page, 'cbuf'
+ * is the left-sibling of the page we're inserting the downlink for.
+ * This function will clear the INCOMPLETE_SPLIT flag on it, and
+ * release the buffer.
*
* Returns the new right sibling of buf, pinned and write-locked.
* The pin and lock on buf are maintained.
*/
static Buffer
-_bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright,
- OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem,
- bool newitemonleft)
+_bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
+ OffsetNumber firstright, OffsetNumber newitemoff, Size newitemsz,
+ IndexTuple newitem, bool newitemonleft)
{
Buffer rbuf;
Page origpage;
@@ -1240,7 +1322,8 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright,
itemid = PageGetItemId(origpage, P_HIKEY);
itemsz = ItemIdGetLength(itemid);
item = (IndexTuple) PageGetItem(origpage, itemid);
- Assert(BTreeTupleGetNAtts(item, rel) == indnkeyatts);
+ Assert(BTreeTupleGetNAtts(item, rel) > 0);
+ Assert(BTreeTupleGetNAtts(item, rel) <= indnkeyatts);
if (PageAddItem(rightpage, (Item) item, itemsz, rightoff,
false, false) == InvalidOffsetNumber)
{
@@ -1254,8 +1337,29 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright,
/*
* The "high key" for the new left page will be the first key that's going
- * to go into the new right page. This might be either the existing data
- * item at position firstright, or the incoming tuple.
+ * to go into the new right page, or possibly a truncated version if this
+ * is a leaf page split. This might be either the existing data item at
+ * position firstright, or the incoming tuple.
+ *
+ * The high key for the left page is formed using the first item on the
+ * right page, which may seem to be contrary to Lehman & Yao's approach of
+ * using the left page's last item as its new high key when splitting on
+ * the leaf level. It isn't, though: suffix truncation will leave the
+ * left page's high key fully equal to the last item on the left page when
+ * two tuples with equal key values (excluding heap TID) enclose the split
+ * point. It isn't actually necessary for a new leaf high key to be equal
+ * to the last item on the left for the L&Y "subtree" invariant to hold.
+ * It's sufficient to make sure that the new leaf high key is strictly
+ * less than the first item on the right leaf page, and greater than or
+ * equal to (not necessarily equal to) the last item on the left leaf
+ * page.
+ *
+ * In other words, when suffix truncation isn't possible, L&Y's exact
+ * approach to leaf splits is taken. (Actually, even that is slightly
+ * inaccurate. A tuple with all the keys from firstright but the heap TID
+ * from lastleft will be used as the new high key, since the last left
+ * tuple could be physically larger despite being opclass-equal in respect
+ * of all attributes prior to the heap TID attribute.)
*/
leftoff = P_HIKEY;
if (!newitemonleft && newitemoff == firstright)
@@ -1273,25 +1377,48 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright,
}
/*
- * Truncate non-key (INCLUDE) attributes of the high key item before
- * inserting it on the left page. This only needs to happen at the leaf
+ * Truncate unneeded key and non-key attributes of the high key item
+ * before inserting it on the left page. This can only happen at the leaf
* level, since in general all pivot tuple values originate from leaf
- * level high keys. This isn't just about avoiding unnecessary work,
- * though; truncating unneeded key attributes (more aggressive suffix
- * truncation) can only be performed at the leaf level anyway. This is
- * because a pivot tuple in a grandparent page must guide a search not
- * only to the correct parent page, but also to the correct leaf page.
+ * level high keys. A pivot tuple in a grandparent page must guide a
+ * search not only to the correct parent page, but also to the correct
+ * leaf page.
*/
- if (indnatts != indnkeyatts && isleaf)
+ if (isleaf && (itup_key->heapkeyspace || indnatts != indnkeyatts))
{
- lefthikey = _bt_nonkey_truncate(rel, item);
+ IndexTuple lastleft;
+
+ /*
+ * Determine which tuple will become the last on the left page. This
+ * is needed to decide how many attributes from the first item on the
+ * right page must remain in the new high key for the left page.
+ */
+ if (newitemonleft && newitemoff == firstright)
+ {
+ /* incoming tuple will become last on left page */
+ lastleft = newitem;
+ }
+ else
+ {
+ OffsetNumber lastleftoff;
+
+ /* item just before firstright will become last on left page */
+ lastleftoff = OffsetNumberPrev(firstright);
+ Assert(lastleftoff >= P_FIRSTDATAKEY(oopaque));
+ itemid = PageGetItemId(origpage, lastleftoff);
+ lastleft = (IndexTuple) PageGetItem(origpage, itemid);
+ }
+
+ Assert(lastleft != item);
+ lefthikey = _bt_truncate(rel, lastleft, item, itup_key);
itemsz = IndexTupleSize(lefthikey);
itemsz = MAXALIGN(itemsz);
}
else
lefthikey = item;
- Assert(BTreeTupleGetNAtts(lefthikey, rel) == indnkeyatts);
+ Assert(BTreeTupleGetNAtts(lefthikey, rel) > 0);
+ Assert(BTreeTupleGetNAtts(lefthikey, rel) <= indnkeyatts);
if (PageAddItem(leftpage, (Item) lefthikey, itemsz, leftoff,
false, false) == InvalidOffsetNumber)
{
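How much _bt_truncate() actually keeps is decided by the lastleft/firstright pair chosen above: enough leading attributes of firstright to be strictly greater than lastleft, with lastleft's heap TID kept as a tiebreaker when every key attribute ties. A toy version of that decision over plain integer attribute arrays (hypothetical representation; the real function also handles INCLUDE columns and constructs the pivot tuple itself):

/* How many leading attributes must the new left high key keep? */
static int
keep_natts_model(const int *lastleft, const int *firstright, int natts)
{
    int         keep;

    for (keep = 1; keep <= natts; keep++)
    {
        if (lastleft[keep - 1] != firstright[keep - 1])
            return keep;        /* first distinguishing attribute */
    }
    /* all key attributes are equal: a heap TID tiebreaker is required */
    return natts;
}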
@@ -1484,7 +1611,6 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright,
xl_btree_split xlrec;
uint8 xlinfo;
XLogRecPtr recptr;
- bool loglhikey = false;
xlrec.level = ropaque->btpo.level;
xlrec.firstright = firstright;
@@ -1513,22 +1639,10 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright,
if (newitemonleft)
XLogRegisterBufData(0, (char *) newitem, MAXALIGN(newitemsz));
- /* Log left page */
- if (!isleaf || indnatts != indnkeyatts)
- {
- /*
- * We must also log the left page's high key. There are two
- * reasons for that: right page's leftmost key is suppressed on
- * non-leaf levels and in covering indexes included columns are
- * truncated from high keys. Show it as belonging to the left
- * page buffer, so that it is not stored if XLogInsert decides it
- * needs a full-page image of the left page.
- */
- itemid = PageGetItemId(origpage, P_HIKEY);
- item = (IndexTuple) PageGetItem(origpage, itemid);
- XLogRegisterBufData(0, (char *) item, MAXALIGN(IndexTupleSize(item)));
- loglhikey = true;
- }
+ /* Log the left page's new high key */
+ itemid = PageGetItemId(origpage, P_HIKEY);
+ item = (IndexTuple) PageGetItem(origpage, itemid);
+ XLogRegisterBufData(0, (char *) item, MAXALIGN(IndexTupleSize(item)));
/*
* Log the contents of the right page in the format understood by
@@ -1544,9 +1658,7 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright,
(char *) rightpage + ((PageHeader) rightpage)->pd_upper,
((PageHeader) rightpage)->pd_special - ((PageHeader) rightpage)->pd_upper);
- xlinfo = newitemonleft ?
- (loglhikey ? XLOG_BTREE_SPLIT_L_HIGHKEY : XLOG_BTREE_SPLIT_L) :
- (loglhikey ? XLOG_BTREE_SPLIT_R_HIGHKEY : XLOG_BTREE_SPLIT_R);
+ xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R;
recptr = XLogInsert(RM_BTREE_ID, xlinfo);
PageSetLSN(origpage, recptr);
@@ -1909,7 +2021,7 @@ _bt_insert_parent(Relation rel,
_bt_relbuf(rel, pbuf);
}
- /* get high key from left page == lower bound for new right page */
+ /* get high key from left, a strict lower bound for new right page */
ritem = (IndexTuple) PageGetItem(page,
PageGetItemId(page, P_HIKEY));
@@ -1939,7 +2051,7 @@ _bt_insert_parent(Relation rel,
RelationGetRelationName(rel), bknum, rbknum);
/* Recursively update the parent */
- _bt_insertonpg(rel, pbuf, buf, stack->bts_parent,
+ _bt_insertonpg(rel, NULL, pbuf, buf, stack->bts_parent,
new_item, stack->bts_offset + 1,
is_only);
@@ -2200,7 +2312,7 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
START_CRIT_SECTION();
/* upgrade metapage if needed */
- if (metad->btm_version < BTREE_VERSION)
+ if (metad->btm_version < BTREE_NOVAC_VERSION)
_bt_upgrademetapage(metapg);
/* set btree special data */
@@ -2235,7 +2347,8 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
/*
* insert the right page pointer into the new root page.
*/
- Assert(BTreeTupleGetNAtts(right_item, rel) ==
+ Assert(BTreeTupleGetNAtts(right_item, rel) > 0);
+ Assert(BTreeTupleGetNAtts(right_item, rel) <=
IndexRelationGetNumberOfKeyAttributes(rel));
if (PageAddItem(rootpage, (Item) right_item, right_item_sz, P_FIRSTKEY,
false, false) == InvalidOffsetNumber)
@@ -2268,6 +2381,8 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD);
XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
+ Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
+ md.version = metad->btm_version;
md.root = rootblknum;
md.level = metad->btm_level;
md.fastroot = rootblknum;
@@ -2332,6 +2447,7 @@ _bt_pgaddtup(Page page,
{
trunctuple = *itup;
trunctuple.t_info = sizeof(IndexTupleData);
+ /* Deliberately zero INDEX_ALT_TID_MASK bits */
BTreeTupleSetNAtts(&trunctuple, 0);
itup = &trunctuple;
itemsize = sizeof(IndexTupleData);
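The zero-attribute trick works by overloading the pivot tuple's item pointer: a flag bit in t_info records that the TID offset field holds an attribute count instead of a heap position. A bit-level sketch with invented mask values (the real INDEX_ALT_TID_MASK and BT_N_KEYS_OFFSET_MASK definitions live in the nbtree headers):

#include <stdint.h>

#define ALT_TID_MASK_MODEL       0x2000 /* invented: "offset holds natts" */
#define N_KEYS_OFFSET_MASK_MODEL 0x0fff /* invented: natts bit field */

typedef struct
{
    uint16_t    t_info;         /* flag + size bits, as in IndexTupleData */
    uint16_t    tid_offset;     /* normally a heap TID offset number */
} PivotModel;

static void
set_natts_model(PivotModel *tup, uint16_t natts)
{
    tup->t_info |= ALT_TID_MASK_MODEL;
    /* masking deliberately leaves the higher status bits zeroed */
    tup->tid_offset = natts & N_KEYS_OFFSET_MASK_MODEL;
}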
@@ -2347,8 +2463,8 @@ _bt_pgaddtup(Page page,
/*
* _bt_isequal - used in _bt_doinsert in check for duplicates.
*
- * This is very similar to _bt_compare, except for NULL handling.
- * Rule is simple: NOT_NULL not equal NULL, NULL not equal NULL too.
+ * This is very similar to _bt_compare, except for NULL and negative
+ * infinity handling: NOT_NULL never equals NULL, and NULL never equals NULL.
*/
static bool
_bt_isequal(TupleDesc itupdesc, BTScanInsert itup_key, Page page,
@@ -2361,6 +2477,7 @@ _bt_isequal(TupleDesc itupdesc, BTScanInsert itup_key, Page page,
/* Better be comparing to a non-pivot item */
Assert(P_ISLEAF((BTPageOpaque) PageGetSpecialPointer(page)));
Assert(offnum >= P_FIRSTDATAKEY((BTPageOpaque) PageGetSpecialPointer(page)));
+ Assert(itup_key->scantid == NULL);
scankey = itup_key->scankeys;
itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
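The NULL rule described above is ordinary SQL comparison semantics, and it is the reason a unique index admits any number of NULLs in the indexed column. A closing sketch with a hypothetical DatumModel type:

#include <stdbool.h>

typedef struct
{
    bool        isnull;
    int         value;
} DatumModel;

/* Model of _bt_isequal()'s rule: NULL equals nothing, not even NULL */
static bool
isequal_model(DatumModel a, DatumModel b)
{
    if (a.isnull || b.isnull)
        return false;
    return a.value == b.value;
}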