Diffstat (limited to 'src/backend/access/nbtree/nbtutils.c')
-rw-r--r--  src/backend/access/nbtree/nbtutils.c  196
1 file changed, 162 insertions, 34 deletions
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index af07732eabc..54afa6f4176 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -81,7 +81,10 @@ static int _bt_keep_natts(Relation rel, IndexTuple lastleft,
* determine whether or not the keys in the index are expected to be
* unique (i.e. if this is a "heapkeyspace" index). We assume a
* heapkeyspace index when caller passes a NULL tuple, allowing index
- * build callers to avoid accessing the non-existent metapage.
+ * build callers to avoid accessing the non-existent metapage. We
+ * also assume that the index is _not_ allequalimage when a NULL tuple
+ * is passed; CREATE INDEX callers call _bt_allequalimage() to set the
+ * field themselves.
*/
BTScanInsert
_bt_mkscankey(Relation rel, IndexTuple itup)
@@ -108,7 +111,14 @@ _bt_mkscankey(Relation rel, IndexTuple itup)
*/
key = palloc(offsetof(BTScanInsertData, scankeys) +
sizeof(ScanKeyData) * indnkeyatts);
- key->heapkeyspace = itup == NULL || _bt_heapkeyspace(rel);
+ if (itup)
+ _bt_metaversion(rel, &key->heapkeyspace, &key->allequalimage);
+ else
+ {
+ /* Utility statement callers can set these fields themselves */
+ key->heapkeyspace = true;
+ key->allequalimage = false;
+ }
key->anynullkeys = false; /* initial assumption */
key->nextkey = false;
key->pivotsearch = false;
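For index builds, the NULL-tuple path above deliberately leaves allequalimage at false. A minimal sketch of the calling pattern the comment describes follows; the call site shown here is illustrative only, and the _bt_allequalimage() signature (index relation plus a report-via-DEBUG flag) is an assumption rather than something visible in this hunk:

BTScanInsert inskey;

/* no index tuple available, so no metapage access from _bt_mkscankey() */
inskey = _bt_mkscankey(rel, NULL);

/* CREATE INDEX caller overrides the pessimistic allequalimage default itself */
inskey->allequalimage = _bt_allequalimage(rel, true);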
@@ -1374,6 +1384,7 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
* attribute passes the qual.
*/
Assert(ScanDirectionIsForward(dir));
+ Assert(BTreeTupleIsPivot(tuple));
continue;
}
@@ -1535,6 +1546,7 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts,
* attribute passes the qual.
*/
Assert(ScanDirectionIsForward(dir));
+ Assert(BTreeTupleIsPivot(tuple));
cmpresult = 0;
if (subkey->sk_flags & SK_ROW_END)
break;
@@ -1774,10 +1786,65 @@ _bt_killitems(IndexScanDesc scan)
{
ItemId iid = PageGetItemId(page, offnum);
IndexTuple ituple = (IndexTuple) PageGetItem(page, iid);
+ bool killtuple = false;
+
+ if (BTreeTupleIsPosting(ituple))
+ {
+ int pi = i + 1;
+ int nposting = BTreeTupleGetNPosting(ituple);
+ int j;
+
+ /*
+ * Note that we rely on the assumption that heap TIDs in the
+ * scanpos items array are always in ascending heap TID order
+ * within a posting list
+ */
+ for (j = 0; j < nposting; j++)
+ {
+ ItemPointer item = BTreeTupleGetPostingN(ituple, j);
+
+ if (!ItemPointerEquals(item, &kitem->heapTid))
+ break; /* out of posting list loop */
+
+ /* kitem must have matching offnum when heap TIDs match */
+ Assert(kitem->indexOffset == offnum);
+
+ /*
+ * Read-ahead to later kitems here.
+ *
+ * We rely on the assumption that not advancing kitem here
+ * will prevent us from considering the posting list tuple
+ * fully dead by not matching its next heap TID in the next
+ * loop iteration.
+ *
+ * If, on the other hand, this is the final heap TID in
+ * the posting list tuple, then tuple gets killed
+ * regardless (i.e. we handle the case where the last
+ * kitem is also the last heap TID in the last index tuple
+ * correctly -- posting tuple still gets killed).
+ */
+ if (pi < numKilled)
+ kitem = &so->currPos.items[so->killedItems[pi++]];
+ }
+
+ /*
+ * Don't bother advancing the outermost loop's int iterator to
+ * avoid processing killed items that relate to the same
+ * offnum/posting list tuple. This micro-optimization hardly
+ * seems worth it. (Further iterations of the outermost loop
+ * will fail to match on this same posting list's first heap
+ * TID instead, so we'll advance to the next offnum/index
+ * tuple pretty quickly.)
+ */
+ if (j == nposting)
+ killtuple = true;
+ }
+ else if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid))
+ killtuple = true;

- if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid))
+ if (killtuple)
{
- /* found the item */
+ /* found the item/all posting list items */
ItemIdMarkDead(iid);
killedsomething = true;
break; /* out of inner search loop */
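The loop above is easier to follow with the TID bookkeeping stripped away. The standalone sketch below uses plain integers in place of heap TIDs and invented names throughout; it mirrors only the control flow, namely that a posting list tuple may be marked dead only when every one of its TIDs is matched by consecutive entries in the sorted killed-items array.

#include <stdbool.h>
#include <stdio.h>

/*
 * Toy model of the read-ahead loop in _bt_killitems(): returns true only
 * when all nposting TIDs are matched by killed[ki..] in order.
 */
bool
posting_fully_dead(const int *posting, int nposting,
                   const int *killed, int nkilled, int ki)
{
    int     j;

    for (j = 0; j < nposting; j++)
    {
        /* posting TID without a matching killed item: cannot kill tuple */
        if (ki >= nkilled || killed[ki] != posting[j])
            break;
        ki++;               /* "read ahead" to the next killed item */
    }

    return j == nposting;
}

int
main(void)
{
    int     posting[] = {10, 11, 12};
    int     all[] = {10, 11, 12};
    int     some[] = {10, 12};

    printf("%d\n", posting_fully_dead(posting, 3, all, 3, 0));   /* prints 1 */
    printf("%d\n", posting_fully_dead(posting, 3, some, 2, 0));  /* prints 0 */
    return 0;
}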
@@ -2018,7 +2085,9 @@ btoptions(Datum reloptions, bool validate)
static const relopt_parse_elt tab[] = {
{"fillfactor", RELOPT_TYPE_INT, offsetof(BTOptions, fillfactor)},
{"vacuum_cleanup_index_scale_factor", RELOPT_TYPE_REAL,
- offsetof(BTOptions, vacuum_cleanup_index_scale_factor)}
+ offsetof(BTOptions, vacuum_cleanup_index_scale_factor)},
+ {"deduplicate_items", RELOPT_TYPE_BOOL,
+ offsetof(BTOptions, deduplicate_items)}
};
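The new entry makes deduplicate_items a per-index boolean reloption. The helper below is a hypothetical stand-in written for this note (the real accessor is a macro in nbtree.h and is not part of this diff); it only illustrates the usual rd_options pattern and the assumption that deduplication defaults to on when the option is absent.

/* hypothetical helper; mirrors how fillfactor-style BTOptions fields are read */
static inline bool
toy_get_deduplicate_items(Relation rel)
{
    BTOptions  *opts = (BTOptions *) rel->rd_options;

    /* assume deduplicate_items defaults to on when no reloption was given */
    return opts ? opts->deduplicate_items : true;
}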
@@ -2119,11 +2188,10 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright,
Size newsize;
/*
- * We should only ever truncate leaf index tuples. It's never okay to
- * truncate a second time.
+ * We should only ever truncate non-pivot tuples from leaf pages. It's
+ * never okay to truncate when splitting an internal page.
*/
- Assert(BTreeTupleGetNAtts(lastleft, rel) == natts);
- Assert(BTreeTupleGetNAtts(firstright, rel) == natts);
+ Assert(!BTreeTupleIsPivot(lastleft) && !BTreeTupleIsPivot(firstright));
/* Determine how many attributes must be kept in truncated tuple */
keepnatts = _bt_keep_natts(rel, lastleft, firstright, itup_key);
@@ -2139,6 +2207,19 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright,
pivot = index_truncate_tuple(itupdesc, firstright, keepnatts);
+ if (BTreeTupleIsPosting(pivot))
+ {
+ /*
+ * index_truncate_tuple() just returns a straight copy of
+ * firstright when it has no key attributes to truncate. We need
+ * to truncate away the posting list ourselves.
+ */
+ Assert(keepnatts == nkeyatts);
+ Assert(natts == nkeyatts);
+ pivot->t_info &= ~INDEX_SIZE_MASK;
+ pivot->t_info |= MAXALIGN(BTreeTupleGetPostingOffset(firstright));
+ }
+
/*
* If there is a distinguishing key attribute within new pivot tuple,
* there is no need to add an explicit heap TID attribute
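Since an index tuple's length lives in the low bits of t_info, stripping the posting list amounts to resetting that size to the MAXALIGN'd offset at which the posting list starts; the same offset-based arithmetic reappears below when a posting-list firstright is copied and given room for a heap TID instead. A toy standalone version of the bit manipulation, with simplified stand-ins for INDEX_SIZE_MASK and MAXALIGN:

#include <stdint.h>
#include <stdio.h>

#define TOY_SIZE_MASK   0x1FFF              /* stand-in for INDEX_SIZE_MASK */
#define TOY_MAXALIGN(x) (((x) + 7) & ~7)    /* 8-byte alignment stand-in */

int
main(void)
{
    uint16_t    t_info = 0x8000 | 72;   /* some flag bit plus a 72-byte tuple */
    uint16_t    posting_offset = 40;    /* key attributes end here */

    t_info &= ~TOY_SIZE_MASK;               /* clear the old size */
    t_info |= TOY_MAXALIGN(posting_offset); /* keep only header + key data */

    printf("%u\n", (unsigned) (t_info & TOY_SIZE_MASK));    /* prints 40 */
    return 0;
}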
@@ -2155,6 +2236,8 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright,
* attribute to the new pivot tuple.
*/
Assert(natts != nkeyatts);
+ Assert(!BTreeTupleIsPosting(lastleft) &&
+ !BTreeTupleIsPosting(firstright));
newsize = IndexTupleSize(pivot) + MAXALIGN(sizeof(ItemPointerData));
tidpivot = palloc0(newsize);
memcpy(tidpivot, pivot, IndexTupleSize(pivot));
@@ -2172,6 +2255,19 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright,
newsize = IndexTupleSize(firstright) + MAXALIGN(sizeof(ItemPointerData));
pivot = palloc0(newsize);
memcpy(pivot, firstright, IndexTupleSize(firstright));
+
+ if (BTreeTupleIsPosting(firstright))
+ {
+ /*
+ * New pivot tuple was copied from firstright, which happens to be
+ * a posting list tuple. We will have to include the max lastleft
+ * heap TID in the final pivot tuple, but we can remove the
+ * posting list now. (Pivot tuples should never contain a posting
+ * list.)
+ */
+ newsize = MAXALIGN(BTreeTupleGetPostingOffset(firstright)) +
+ MAXALIGN(sizeof(ItemPointerData));
+ }
}
/*
@@ -2199,7 +2295,7 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright,
*/
pivotheaptid = (ItemPointer) ((char *) pivot + newsize -
sizeof(ItemPointerData));
- ItemPointerCopy(&lastleft->t_tid, pivotheaptid);
+ ItemPointerCopy(BTreeTupleGetMaxHeapTID(lastleft), pivotheaptid);
/*
* Lehman and Yao require that the downlink to the right page, which is to
@@ -2210,9 +2306,12 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright,
* tiebreaker.
*/
#ifndef DEBUG_NO_TRUNCATE
- Assert(ItemPointerCompare(&lastleft->t_tid, &firstright->t_tid) < 0);
- Assert(ItemPointerCompare(pivotheaptid, &lastleft->t_tid) >= 0);
- Assert(ItemPointerCompare(pivotheaptid, &firstright->t_tid) < 0);
+ Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(lastleft),
+ BTreeTupleGetHeapTID(firstright)) < 0);
+ Assert(ItemPointerCompare(pivotheaptid,
+ BTreeTupleGetHeapTID(lastleft)) >= 0);
+ Assert(ItemPointerCompare(pivotheaptid,
+ BTreeTupleGetHeapTID(firstright)) < 0);
#else
/*
@@ -2225,7 +2324,7 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright,
* attribute values along with lastleft's heap TID value when lastleft's
* TID happens to be greater than firstright's TID.
*/
- ItemPointerCopy(&firstright->t_tid, pivotheaptid);
+ ItemPointerCopy(BTreeTupleGetHeapTID(firstright), pivotheaptid);
/*
* Pivot heap TID should never be fully equal to firstright. Note that
@@ -2234,7 +2333,8 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright,
*/
ItemPointerSetOffsetNumber(pivotheaptid,
OffsetNumberPrev(ItemPointerGetOffsetNumber(pivotheaptid)));
- Assert(ItemPointerCompare(pivotheaptid, &firstright->t_tid) < 0);
+ Assert(ItemPointerCompare(pivotheaptid,
+ BTreeTupleGetHeapTID(firstright)) < 0);
#endif
BTreeTupleSetNAtts(pivot, nkeyatts);
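A quick worked check of the invariants asserted above once posting lists are involved: BTreeTupleGetMaxHeapTID() yields the last TID of a posting list tuple, and it is that value, not t_tid, that must still sort strictly below firstright's first heap TID. Block/offset pairs are flattened to single integers purely for illustration.

#include <assert.h>

int
main(void)
{
    /* lastleft is a posting list tuple: its max heap TID is the last entry */
    int     lastleft_tids[] = {1703, 1708, 2001};
    int     firstright_first_tid = 2104;
    int     pivotheaptid = lastleft_tids[2];    /* "max heap TID" */

    assert(pivotheaptid >= lastleft_tids[0]);       /* >= lastleft's TIDs */
    assert(pivotheaptid < firstright_first_tid);    /* < firstright's TID */
    return 0;
}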
@@ -2301,6 +2401,13 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright,
keepnatts++;
}
+ /*
+ * Assert that _bt_keep_natts_fast() agrees with us in passing. This is
+ * expected in an allequalimage index.
+ */
+ Assert(!itup_key->allequalimage ||
+ keepnatts == _bt_keep_natts_fast(rel, lastleft, firstright));
+
return keepnatts;
}
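The cross-check asserted above is easier to reason about against a toy restatement of what both routines compute: the number of leading key attributes to keep is one more than the length of the common prefix, reaching nkeyatts + 1 when no key attribute distinguishes the tuples and a heap TID tiebreaker becomes necessary. Integers stand in for datums here; in the real code only the equality test differs between _bt_keep_natts (opclass comparisons) and _bt_keep_natts_fast (datum_image_eq).

/* Toy version of the keepnatts calculation shared by both routines */
int
toy_keep_natts(const int *lastleft, const int *firstright, int nkeyatts)
{
    int     keepnatts = 1;

    for (int attnum = 1; attnum <= nkeyatts; attnum++)
    {
        if (lastleft[attnum - 1] != firstright[attnum - 1])
            break;
        keepnatts++;
    }

    return keepnatts;
}

For lastleft {1, 7, 3} and firstright {1, 9, 5} this returns 2: the second attribute already distinguishes the tuples, so the third can be truncated away.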
@@ -2315,13 +2422,16 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright,
* The approach taken here usually provides the same answer as _bt_keep_natts
* will (for the same pair of tuples from a heapkeyspace index), since the
* majority of btree opclasses can never indicate that two datums are equal
- * unless they're bitwise equal after detoasting.
+ * unless they're bitwise equal after detoasting. When an index only has
+ * "equal image" columns, routine is guaranteed to give the same result as
+ * _bt_keep_natts would.
*
- * These issues must be acceptable to callers, typically because they're only
- * concerned about making suffix truncation as effective as possible without
- * leaving excessive amounts of free space on either side of page split.
* Callers can rely on the fact that attributes considered equal here are
- * definitely also equal according to _bt_keep_natts.
+ * definitely also equal according to _bt_keep_natts, even when the index uses
+ * an opclass or collation that is not "allequalimage"/deduplication-safe.
+ * This weaker guarantee is good enough for the nbtsplitloc.c caller, since false
+ * negatives generally only have the effect of making leaf page splits use a
+ * more balanced split point.
*/
int
_bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright)
@@ -2393,28 +2503,42 @@ _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
* Mask allocated for number of keys in index tuple must be able to fit
* maximum possible number of index attributes
*/
- StaticAssertStmt(BT_N_KEYS_OFFSET_MASK >= INDEX_MAX_KEYS,
- "BT_N_KEYS_OFFSET_MASK can't fit INDEX_MAX_KEYS");
+ StaticAssertStmt(BT_OFFSET_MASK >= INDEX_MAX_KEYS,
+ "BT_OFFSET_MASK can't fit INDEX_MAX_KEYS");
itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
tupnatts = BTreeTupleGetNAtts(itup, rel);
+ /* !heapkeyspace indexes do not support deduplication */
+ if (!heapkeyspace && BTreeTupleIsPosting(itup))
+ return false;
+
+ /* Posting list tuples should never have "pivot heap TID" bit set */
+ if (BTreeTupleIsPosting(itup) &&
+ (ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) &
+ BT_PIVOT_HEAP_TID_ATTR) != 0)
+ return false;
+
+ /* INCLUDE indexes do not support deduplication */
+ if (natts != nkeyatts && BTreeTupleIsPosting(itup))
+ return false;
+
if (P_ISLEAF(opaque))
{
if (offnum >= P_FIRSTDATAKEY(opaque))
{
/*
- * Non-pivot tuples currently never use alternative heap TID
- * representation -- even those within heapkeyspace indexes
+ * Non-pivot tuple should never be explicitly marked as a pivot
+ * tuple
*/
- if ((itup->t_info & INDEX_ALT_TID_MASK) != 0)
+ if (BTreeTupleIsPivot(itup))
return false;
/*
* Leaf tuples that are not the page high key (non-pivot tuples)
* should never be truncated. (Note that tupnatts must have been
- * inferred, rather than coming from an explicit on-disk
- * representation.)
+ * inferred, even with a posting list tuple, because only pivot
+ * tuples store tupnatts directly.)
*/
return tupnatts == natts;
}
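The checks added above reduce to a few representation rules for posting list tuples. A toy predicate restating just those rules (names simplified, and the pre-existing attribute-count checks omitted):

#include <stdbool.h>

/* Toy restatement of the new _bt_check_natts() posting list rules */
bool
toy_posting_rules_ok(bool heapkeyspace, bool include_index,
                     bool is_posting, bool has_pivot_heap_tid_bit)
{
    if (is_posting && !heapkeyspace)
        return false;           /* !heapkeyspace indexes never deduplicate */
    if (is_posting && has_pivot_heap_tid_bit)
        return false;           /* posting tuples never set the pivot TID bit */
    if (is_posting && include_index)
        return false;           /* INCLUDE indexes never deduplicate */
    return true;
}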
@@ -2458,12 +2582,12 @@ _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
* non-zero, or when there is no explicit representation and the
* tuple is evidently not a pre-pg_upgrade tuple.
*
- * Prior to v11, downlinks always had P_HIKEY as their offset. Use
- * that to decide if the tuple is a pre-v11 tuple.
+ * Prior to v11, downlinks always had P_HIKEY as their offset.
+ * Accept that as an alternative indication of a valid
+ * !heapkeyspace negative infinity tuple.
*/
return tupnatts == 0 ||
- ((itup->t_info & INDEX_ALT_TID_MASK) == 0 &&
- ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY);
+ ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY;
}
else
{
@@ -2489,7 +2613,11 @@ _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
* heapkeyspace index pivot tuples, regardless of whether or not there are
* non-key attributes.
*/
- if ((itup->t_info & INDEX_ALT_TID_MASK) == 0)
+ if (!BTreeTupleIsPivot(itup))
+ return false;
+
+ /* Pivot tuple should not use posting list representation (redundant) */
+ if (BTreeTupleIsPosting(itup))
return false;
/*
@@ -2559,8 +2687,8 @@ _bt_check_third_page(Relation rel, Relation heap, bool needheaptidspace,
BTMaxItemSizeNoHeapTid(page),
RelationGetRelationName(rel)),
errdetail("Index row references tuple (%u,%u) in relation \"%s\".",
- ItemPointerGetBlockNumber(&newtup->t_tid),
- ItemPointerGetOffsetNumber(&newtup->t_tid),
+ ItemPointerGetBlockNumber(BTreeTupleGetHeapTID(newtup)),
+ ItemPointerGetOffsetNumber(BTreeTupleGetHeapTID(newtup)),
RelationGetRelationName(heap)),
errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
"Consider a function index of an MD5 hash of the value, "