2 files changed, 189 insertions, 59 deletions
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 8b3c9dea256..f985a056662 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -112,19 +112,46 @@ typedef struct BTMetaPageData
 #define BTPageGetMeta(p) \
 	((BTMetaPageData *) PageGetContents(p))
 
+/*
+ * The current Btree version is 4.  That's what you'll get when you create
+ * a new index.
+ *
+ * Btree version 3 was used in PostgreSQL v11.  It is mostly the same as
+ * version 4, but heap TIDs were not part of the keyspace.  Index tuples
+ * with duplicate keys could be stored in any order.  We continue to
+ * support reading and writing Btree versions 2 and 3, so that they don't
+ * need to be immediately re-indexed at pg_upgrade.  In order to get the
+ * new heapkeyspace semantics, however, a REINDEX is needed.
+ *
+ * Btree version 2 is mostly the same as version 3.  There are two new
+ * fields in the metapage that were introduced in version 3.  A version 2
+ * metapage will be automatically upgraded to version 3 on the first
+ * insert to it.  INCLUDE indexes cannot use version 2.
+ */
 #define BTREE_METAPAGE	0		/* first page is meta */
-#define BTREE_MAGIC		0x053162	/* magic number of btree pages */
-#define BTREE_VERSION	3		/* current version number */
+#define BTREE_MAGIC		0x053162	/* magic number in metapage */
+#define BTREE_VERSION	4		/* current version number */
 #define BTREE_MIN_VERSION	2	/* minimal supported version number */
+#define BTREE_NOVAC_VERSION	3	/* minimal version with all meta fields */
 
 /*
  * Maximum size of a btree index entry, including its tuple header.
  *
  * We actually need to be able to fit three items on every page,
  * so restrict any one item to 1/3 the per-page available space.
+ *
+ * There are rare cases where _bt_truncate() will need to enlarge
+ * a heap index tuple to make space for a tiebreaker heap TID
+ * attribute, which we account for here.
  */
 #define BTMaxItemSize(page) \
 	MAXALIGN_DOWN((PageGetPageSize(page) - \
+				   MAXALIGN(SizeOfPageHeaderData + \
+							3*sizeof(ItemIdData)  + \
+							3*sizeof(ItemPointerData)) - \
+				   MAXALIGN(sizeof(BTPageOpaqueData))) / 3)
+#define BTMaxItemSizeNoHeapTid(page) \
+	MAXALIGN_DOWN((PageGetPageSize(page) - \
 				   MAXALIGN(SizeOfPageHeaderData + 3*sizeof(ItemIdData)) - \
 				   MAXALIGN(sizeof(BTPageOpaqueData))) / 3)
 
@@ -166,12 +193,13 @@ typedef struct BTMetaPageData
 
 /*
  *	Lehman and Yao's algorithm requires a ``high key'' on every non-rightmost
- *	page.  The high key is not a data key, but gives info about what range of
- *	keys is supposed to be on this page.  The high key on a page is required
- *	to be greater than or equal to any data key that appears on the page.
- *	If we find ourselves trying to insert a key > high key, we know we need
- *	to move right (this should only happen if the page was split since we
- *	examined the parent page).
+ *	page.  The high key is not a tuple that is used to visit the heap.  It is
+ *	a pivot tuple (see "Notes on B-Tree tuple format" below for definition).
+ *	The high key on a page is required to be greater than or equal to any
+ *	other key that appears on the page.  If we find ourselves trying to
+ *	insert a key that is strictly > high key, we know we need to move right
+ *	(this should only happen if the page was split since we examined the
+ *	parent page).
  *
  *	Our insertion algorithm guarantees that we can use the initial least key
  *	on our right sibling as the high key.  Once a page is created, its high
@@ -187,38 +215,84 @@ typedef struct BTMetaPageData
 #define P_FIRSTDATAKEY(opaque)	(P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY)
 
 /*
+ *
+ * Notes on B-Tree tuple format, and key and non-key attributes:
+ *
  * INCLUDE B-Tree indexes have non-key attributes.  These are extra
  * attributes that may be returned by index-only scans, but do not influence
  * the order of items in the index (formally, non-key attributes are not
  * considered to be part of the key space).  Non-key attributes are only
  * present in leaf index tuples whose item pointers actually point to heap
- * tuples.  All other types of index tuples (collectively, "pivot" tuples)
- * only have key attributes, since pivot tuples only ever need to represent
- * how the key space is separated.  In general, any B-Tree index that has
- * more than one level (i.e. any index that does not just consist of a
- * metapage and a single leaf root page) must have some number of pivot
- * tuples, since pivot tuples are used for traversing the tree.
- *
- * We store the number of attributes present inside pivot tuples by abusing
- * their item pointer offset field, since pivot tuples never need to store a
- * real offset (downlinks only need to store a block number).  The offset
- * field only stores the number of attributes when the INDEX_ALT_TID_MASK
- * bit is set (we never assume that pivot tuples must explicitly store the
- * number of attributes, and currently do not bother storing the number of
- * attributes unless indnkeyatts actually differs from indnatts).
- * INDEX_ALT_TID_MASK is only used for pivot tuples at present, though it's
- * possible that it will be used within non-pivot tuples in the future.  Do
- * not assume that a tuple with INDEX_ALT_TID_MASK set must be a pivot
- * tuple.
- *
- * The 12 least significant offset bits are used to represent the number of
- * attributes in INDEX_ALT_TID_MASK tuples, leaving 4 bits that are reserved
- * for future use (BT_RESERVED_OFFSET_MASK bits). BT_N_KEYS_OFFSET_MASK should
- * be large enough to store any number <= INDEX_MAX_KEYS.
+ * tuples (non-pivot tuples).  _bt_check_natts() enforces the rules
+ * described here.
+ *
+ * Non-pivot tuple format:
+ *
+ *  t_tid | t_info | key values | INCLUDE columns, if any
+ *
+ * t_tid points to the heap TID, which is a tiebreaker key column as of
+ * BTREE_VERSION 4.  Currently, the INDEX_ALT_TID_MASK status bit is never
+ * set for non-pivot tuples.
+ *
+ * All other types of index tuples ("pivot" tuples) only have key columns,
+ * since pivot tuples only exist to represent how the key space is
+ * separated.  In general, any B-Tree index that has more than one level
+ * (i.e. any index that does not just consist of a metapage and a single
+ * leaf root page) must have some number of pivot tuples, since pivot
+ * tuples are used for traversing the tree.  Suffix truncation can omit
+ * trailing key columns when a new pivot is formed, which makes minus
+ * infinity their logical value.  Since BTREE_VERSION 4 indexes treat heap
+ * TID as a trailing key column that ensures that all index tuples are
+ * physically unique, it is necessary to represent heap TID as a trailing
+ * key column in pivot tuples, though very often this can be truncated
+ * away, just like any other key column. (Actually, the heap TID is
+ * omitted rather than truncated, since its representation is different to
+ * the non-pivot representation.)
+ *
+ * Pivot tuple format:
+ *
+ *  t_tid | t_info | key values | [heap TID]
+ *
+ * We store the number of columns present inside pivot tuples by abusing
+ * their t_tid offset field, since pivot tuples never need to store a real
+ * offset (downlinks only need to store a block number in t_tid).  The
+ * offset field only stores the number of columns/attributes when the
+ * INDEX_ALT_TID_MASK bit is set, which doesn't count the trailing heap
+ * TID column sometimes stored in pivot tuples -- that's represented by
+ * the presence of BT_HEAP_TID_ATTR.  The INDEX_ALT_TID_MASK bit in t_info
+ * is always set on BTREE_VERSION 4.  BT_HEAP_TID_ATTR can only be set on
+ * BTREE_VERSION 4.
+ *
+ * In version 3 indexes, the INDEX_ALT_TID_MASK flag might not be set in
+ * pivot tuples.  In that case, the number of key columns is implicitly
+ * the same as the number of key columns in the index.  It is not usually
+ * set on version 2 indexes, which predate the introduction of INCLUDE
+ * indexes.  (Only explicitly truncated pivot tuples explicitly represent
+ * the number of key columns on versions 2 and 3, whereas all pivot tuples
+ * are formed using truncation on version 4.  A version 2 index will have
+ * it set for an internal page negative infinity item iff internal page
+ * split occurred after upgrade to Postgres 11+.)
+ *
+ * The 12 least significant offset bits from t_tid are used to represent
+ * the number of columns in INDEX_ALT_TID_MASK tuples, leaving 4 status
+ * bits (BT_RESERVED_OFFSET_MASK bits), 3 of which that are reserved for
+ * future use.  BT_N_KEYS_OFFSET_MASK should be large enough to store any
+ * number of columns/attributes <= INDEX_MAX_KEYS.
+ *
+ * Note well: The macros that deal with the number of attributes in tuples
+ * assume that a tuple with INDEX_ALT_TID_MASK set must be a pivot tuple,
+ * and that a tuple without INDEX_ALT_TID_MASK set must be a non-pivot
+ * tuple (or must have the same number of attributes as the index has
+ * generally in the case of !heapkeyspace indexes).  They will need to be
+ * updated if non-pivot tuples ever get taught to use INDEX_ALT_TID_MASK
+ * for something else.
  */
 #define INDEX_ALT_TID_MASK			INDEX_AM_RESERVED_BIT
+
+/* Item pointer offset bits */
 #define BT_RESERVED_OFFSET_MASK		0xF000
 #define BT_N_KEYS_OFFSET_MASK		0x0FFF
+#define BT_HEAP_TID_ATTR			0x1000
 
 /* Get/set downlink block number */
 #define BTreeInnerTupleGetDownLink(itup) \
@@ -241,14 +315,16 @@ typedef struct BTMetaPageData
 	} while(0)
 
 /*
- * Get/set number of attributes within B-tree index tuple. Asserts should be
- * removed when BT_RESERVED_OFFSET_MASK bits will be used.
+ * Get/set number of attributes within B-tree index tuple.
+ *
+ * Note that this does not include an implicit tiebreaker heap-TID
+ * attribute, if any.  Note also that the number of key attributes must be
+ * explicitly represented in all heapkeyspace pivot tuples.
  */
 #define BTreeTupleGetNAtts(itup, rel)	\
 	( \
 		(itup)->t_info & INDEX_ALT_TID_MASK ? \
 		( \
-			AssertMacro((ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_RESERVED_OFFSET_MASK) == 0), \
 			ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_N_KEYS_OFFSET_MASK \
 		) \
 		: \
@@ -257,11 +333,35 @@ typedef struct BTMetaPageData
 #define BTreeTupleSetNAtts(itup, n) \
 	do { \
 		(itup)->t_info |= INDEX_ALT_TID_MASK; \
-		Assert(((n) & BT_RESERVED_OFFSET_MASK) == 0); \
 		ItemPointerSetOffsetNumber(&(itup)->t_tid, (n) & BT_N_KEYS_OFFSET_MASK); \
 	} while(0)
 
 /*
+ * Get tiebreaker heap TID attribute, if any.  Macro works with both pivot
+ * and non-pivot tuples, despite differences in how heap TID is represented.
+ */
+#define BTreeTupleGetHeapTID(itup) \
+	( \
+	  (itup)->t_info & INDEX_ALT_TID_MASK && \
+	  (ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_HEAP_TID_ATTR) != 0 ? \
+	  ( \
+		(ItemPointer) (((char *) (itup) + IndexTupleSize(itup)) - \
+					   sizeof(ItemPointerData)) \
+	  ) \
+	  : (itup)->t_info & INDEX_ALT_TID_MASK ? NULL : (ItemPointer) &((itup)->t_tid) \
+	)
+/*
+ * Set the heap TID attribute for a tuple that uses the INDEX_ALT_TID_MASK
+ * representation (currently limited to pivot tuples)
+ */
+#define BTreeTupleSetAltHeapTID(itup) \
+	do { \
+		Assert((itup)->t_info & INDEX_ALT_TID_MASK); \
+		ItemPointerSetOffsetNumber(&(itup)->t_tid, \
+								   ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) | BT_HEAP_TID_ATTR); \
+	} while(0)
+
+/*
  *	Operator strategy numbers for B-tree have been moved to access/stratnum.h,
  *	because many places need to use them in ScanKeyInit() calls.
  *
@@ -325,20 +425,42 @@ typedef BTStackData *BTStack;
  * be confused with a search scankey).  It's used to descend a B-Tree using
  * _bt_search.
  *
+ * heapkeyspace indicates if we expect all keys in the index to be physically
+ * unique because heap TID is used as a tiebreaker attribute, and if index may
+ * have truncated key attributes in pivot tuples.  This is actually a property
+ * of the index relation itself (not an indexscan).  heapkeyspace indexes are
+ * indexes whose version is >= version 4.  It's convenient to keep this close
+ * by, rather than accessing the metapage repeatedly.
+ *
  * When nextkey is false (the usual case), _bt_search and _bt_binsrch will
  * locate the first item >= scankey.  When nextkey is true, they will locate
  * the first item > scan key.
  *
- * scankeys is an array of scan key entries for attributes that are compared.
- * keysz is the size of the array.  During insertion, there must be a scan key
- * for every attribute, but when starting a regular index scan some can be
- * omitted.  The array is used as a flexible array member, though it's sized
- * in a way that makes it possible to use stack allocations.  See
- * nbtree/README for full details.
+ * pivotsearch is set to true by callers that want to re-find a leaf page
+ * using a scankey built from a leaf page's high key.  Most callers set this
+ * to false.
+ *
+ * scantid is the heap TID that is used as a final tiebreaker attribute.  It
+ * is set to NULL when index scan doesn't need to find a position for a
+ * specific physical tuple.  Must be set when inserting new tuples into
+ * heapkeyspace indexes, since every tuple in the tree unambiguously belongs
+ * in one exact position (it's never set with !heapkeyspace indexes, though).
+ * Despite the representational difference, nbtree search code considers
+ * scantid to be just another insertion scankey attribute.
+ *
+ * scankeys is an array of scan key entries for attributes that are compared
+ * before scantid (user-visible attributes).  keysz is the size of the array.
+ * During insertion, there must be a scan key for every attribute, but when
+ * starting a regular index scan some can be omitted.  The array is used as a
+ * flexible array member, though it's sized in a way that makes it possible to
+ * use stack allocations.  See nbtree/README for full details.
  */
 typedef struct BTScanInsertData
 {
+	bool		heapkeyspace;
 	bool		nextkey;
+	bool		pivotsearch;
+	ItemPointer scantid;		/* tiebreaker for scankeys */
 	int			keysz;			/* Size of scankeys array */
 	ScanKeyData scankeys[INDEX_MAX_KEYS];	/* Must appear last */
 } BTScanInsertData;
@@ -599,6 +721,7 @@ extern void _bt_upgrademetapage(Page page);
 extern Buffer _bt_getroot(Relation rel, int access);
 extern Buffer _bt_gettrueroot(Relation rel);
 extern int	_bt_getrootheight(Relation rel);
+extern bool _bt_heapkeyspace(Relation rel);
 extern void _bt_checkpage(Relation rel, Buffer buf);
 extern Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access);
 extern Buffer _bt_relandgetbuf(Relation rel, Buffer obuf,
@@ -652,8 +775,12 @@ extern bytea *btoptions(Datum reloptions, bool validate);
 extern bool btproperty(Oid index_oid, int attno,
 		   IndexAMProperty prop, const char *propname,
 		   bool *res, bool *isnull);
-extern IndexTuple _bt_nonkey_truncate(Relation rel, IndexTuple itup);
-extern bool _bt_check_natts(Relation rel, Page page, OffsetNumber offnum);
+extern IndexTuple _bt_truncate(Relation rel, IndexTuple lastleft,
+			 IndexTuple firstright, BTScanInsert itup_key);
+extern bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page,
+				OffsetNumber offnum);
+extern void _bt_check_third_page(Relation rel, Relation heap,
+					 bool needheaptidspace, Page page, IndexTuple newtup);
 
 /*
  * prototypes for functions in nbtvalidate.c
diff --git a/src/include/access/nbtxlog.h b/src/include/access/nbtxlog.h
index a605851c981..6320a0098ff 100644
--- a/src/include/access/nbtxlog.h
+++ b/src/include/access/nbtxlog.h
@@ -28,8 +28,7 @@
 #define XLOG_BTREE_INSERT_META	0x20	/* same, plus update metapage */
 #define XLOG_BTREE_SPLIT_L		0x30	/* add index tuple with split */
 #define XLOG_BTREE_SPLIT_R		0x40	/* as above, new item on right */
-#define XLOG_BTREE_SPLIT_L_HIGHKEY 0x50 /* as above, include truncated highkey */
-#define XLOG_BTREE_SPLIT_R_HIGHKEY 0x60 /* as above, include truncated highkey */
+/* 0x50 and 0x60 are unused */
 #define XLOG_BTREE_DELETE		0x70	/* delete leaf index tuples for a page */
 #define XLOG_BTREE_UNLINK_PAGE	0x80	/* delete a half-dead page */
 #define XLOG_BTREE_UNLINK_PAGE_META 0x90	/* same, and update metapage */
@@ -47,6 +46,7 @@
  */
 typedef struct xl_btree_metadata
 {
+	uint32		version;
 	BlockNumber root;
 	uint32		level;
 	BlockNumber fastroot;
@@ -80,27 +80,30 @@ typedef struct xl_btree_insert
  * whole page image.  The left page, however, is handled in the normal
  * incremental-update fashion.
  *
- * Note: the four XLOG_BTREE_SPLIT xl_info codes all use this data record.
- * The _L and _R variants indicate whether the inserted tuple went into the
- * left or right split page (and thus, whether newitemoff and the new item
- * are stored or not).  The _HIGHKEY variants indicate that we've logged
- * explicitly left page high key value, otherwise redo should use right page
- * leftmost key as a left page high key.  _HIGHKEY is specified for internal
- * pages where right page leftmost key is suppressed, and for leaf pages
- * of covering indexes where high key have non-key attributes truncated.
+ * Note: XLOG_BTREE_SPLIT_L and XLOG_BTREE_SPLIT_R share this data record.
+ * There are two variants to indicate whether the inserted tuple went into the
+ * left or right split page (and thus, whether newitemoff and the new item are
+ * stored or not).  We always log the left page high key because suffix
+ * truncation can generate a new leaf high key using user-defined code.  This
+ * is also necessary on internal pages, since the first right item that the
+ * left page's high key was based on will have been truncated to zero
+ * attributes in the right page (the original is unavailable from the right
+ * page).
  *
  * Backup Blk 0: original page / new left page
  *
  * The left page's data portion contains the new item, if it's the _L variant.
- * (In the _R variants, the new item is one of the right page's tuples.)
- * If level > 0, an IndexTuple representing the HIKEY of the left page
- * follows.  We don't need this on leaf pages, because it's the same as the
- * leftmost key in the new right page.
+ * An IndexTuple representing the high key of the left page must follow with
+ * either variant.
  *
  * Backup Blk 1: new right page
  *
- * The right page's data portion contains the right page's tuples in the
- * form used by _bt_restore_page.
+ * The right page's data portion contains the right page's tuples in the form
+ * used by _bt_restore_page.  This includes the new item, if it's the _R
+ * variant.  The right page's tuples also include the right page's high key
+ * with either variant (moved from the left/original page during the split),
+ * unless the split happened to be of the rightmost page on its level, where
+ * there is no high key for new right page.
  *
  * Backup Blk 2: next block (orig page's rightlink), if any
  * Backup Blk 3: child's left sibling, if non-leaf split