aboutsummaryrefslogtreecommitdiff
path: root/src/include
diff options
context:
space:
mode:
Diffstat (limited to 'src/include')
-rw-r--r--src/include/access/nbtree.h213
-rw-r--r--src/include/access/nbtxlog.h35
2 files changed, 189 insertions, 59 deletions
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 8b3c9dea256..f985a056662 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -112,19 +112,46 @@ typedef struct BTMetaPageData
#define BTPageGetMeta(p) \
((BTMetaPageData *) PageGetContents(p))
+/*
+ * The current Btree version is 4. That's what you'll get when you create
+ * a new index.
+ *
+ * Btree version 3 was used in PostgreSQL v11. It is mostly the same as
+ * version 4, but heap TIDs were not part of the keyspace. Index tuples
+ * with duplicate keys could be stored in any order. We continue to
+ * support reading and writing Btree versions 2 and 3, so that they don't
+ * need to be immediately re-indexed at pg_upgrade. In order to get the
+ * new heapkeyspace semantics, however, a REINDEX is needed.
+ *
+ * Btree version 2 is mostly the same as version 3. There are two new
+ * fields in the metapage that were introduced in version 3. A version 2
+ * metapage will be automatically upgraded to version 3 on the first
+ * insert to it. INCLUDE indexes cannot use version 2.
+ */
#define BTREE_METAPAGE 0 /* first page is meta */
-#define BTREE_MAGIC 0x053162 /* magic number of btree pages */
-#define BTREE_VERSION 3 /* current version number */
+#define BTREE_MAGIC 0x053162 /* magic number in metapage */
+#define BTREE_VERSION 4 /* current version number */
#define BTREE_MIN_VERSION 2 /* minimal supported version number */
+#define BTREE_NOVAC_VERSION 3 /* minimal version with all meta fields */
/*
* Maximum size of a btree index entry, including its tuple header.
*
* We actually need to be able to fit three items on every page,
* so restrict any one item to 1/3 the per-page available space.
+ *
+ * There are rare cases where _bt_truncate() will need to enlarge
+ * a heap index tuple to make space for a tiebreaker heap TID
+ * attribute, which we account for here.
*/
#define BTMaxItemSize(page) \
MAXALIGN_DOWN((PageGetPageSize(page) - \
+ MAXALIGN(SizeOfPageHeaderData + \
+ 3*sizeof(ItemIdData) + \
+ 3*sizeof(ItemPointerData)) - \
+ MAXALIGN(sizeof(BTPageOpaqueData))) / 3)
+#define BTMaxItemSizeNoHeapTid(page) \
+ MAXALIGN_DOWN((PageGetPageSize(page) - \
MAXALIGN(SizeOfPageHeaderData + 3*sizeof(ItemIdData)) - \
MAXALIGN(sizeof(BTPageOpaqueData))) / 3)
@@ -166,12 +193,13 @@ typedef struct BTMetaPageData
/*
* Lehman and Yao's algorithm requires a ``high key'' on every non-rightmost
- * page. The high key is not a data key, but gives info about what range of
- * keys is supposed to be on this page. The high key on a page is required
- * to be greater than or equal to any data key that appears on the page.
- * If we find ourselves trying to insert a key > high key, we know we need
- * to move right (this should only happen if the page was split since we
- * examined the parent page).
+ * page. The high key is not a tuple that is used to visit the heap. It is
+ * a pivot tuple (see "Notes on B-Tree tuple format" below for definition).
+ * The high key on a page is required to be greater than or equal to any
+ * other key that appears on the page. If we find ourselves trying to
+ * insert a key that is strictly > high key, we know we need to move right
+ * (this should only happen if the page was split since we examined the
+ * parent page).
*
* Our insertion algorithm guarantees that we can use the initial least key
* on our right sibling as the high key. Once a page is created, its high
@@ -187,38 +215,84 @@ typedef struct BTMetaPageData
#define P_FIRSTDATAKEY(opaque) (P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY)
/*
+ *
+ * Notes on B-Tree tuple format, and key and non-key attributes:
+ *
* INCLUDE B-Tree indexes have non-key attributes. These are extra
* attributes that may be returned by index-only scans, but do not influence
* the order of items in the index (formally, non-key attributes are not
* considered to be part of the key space). Non-key attributes are only
* present in leaf index tuples whose item pointers actually point to heap
- * tuples. All other types of index tuples (collectively, "pivot" tuples)
- * only have key attributes, since pivot tuples only ever need to represent
- * how the key space is separated. In general, any B-Tree index that has
- * more than one level (i.e. any index that does not just consist of a
- * metapage and a single leaf root page) must have some number of pivot
- * tuples, since pivot tuples are used for traversing the tree.
- *
- * We store the number of attributes present inside pivot tuples by abusing
- * their item pointer offset field, since pivot tuples never need to store a
- * real offset (downlinks only need to store a block number). The offset
- * field only stores the number of attributes when the INDEX_ALT_TID_MASK
- * bit is set (we never assume that pivot tuples must explicitly store the
- * number of attributes, and currently do not bother storing the number of
- * attributes unless indnkeyatts actually differs from indnatts).
- * INDEX_ALT_TID_MASK is only used for pivot tuples at present, though it's
- * possible that it will be used within non-pivot tuples in the future. Do
- * not assume that a tuple with INDEX_ALT_TID_MASK set must be a pivot
- * tuple.
- *
- * The 12 least significant offset bits are used to represent the number of
- * attributes in INDEX_ALT_TID_MASK tuples, leaving 4 bits that are reserved
- * for future use (BT_RESERVED_OFFSET_MASK bits). BT_N_KEYS_OFFSET_MASK should
- * be large enough to store any number <= INDEX_MAX_KEYS.
+ * tuples (non-pivot tuples). _bt_check_natts() enforces the rules
+ * described here.
+ *
+ * Non-pivot tuple format:
+ *
+ * t_tid | t_info | key values | INCLUDE columns, if any
+ *
+ * t_tid points to the heap TID, which is a tiebreaker key column as of
+ * BTREE_VERSION 4. Currently, the INDEX_ALT_TID_MASK status bit is never
+ * set for non-pivot tuples.
+ *
+ * All other types of index tuples ("pivot" tuples) only have key columns,
+ * since pivot tuples only exist to represent how the key space is
+ * separated. In general, any B-Tree index that has more than one level
+ * (i.e. any index that does not just consist of a metapage and a single
+ * leaf root page) must have some number of pivot tuples, since pivot
+ * tuples are used for traversing the tree. Suffix truncation can omit
+ * trailing key columns when a new pivot is formed, which makes minus
+ * infinity their logical value. Since BTREE_VERSION 4 indexes treat heap
+ * TID as a trailing key column that ensures that all index tuples are
+ * physically unique, it is necessary to represent heap TID as a trailing
+ * key column in pivot tuples, though very often this can be truncated
+ * away, just like any other key column. (Actually, the heap TID is
+ * omitted rather than truncated, since its representation is different to
+ * the non-pivot representation.)
+ *
+ * Pivot tuple format:
+ *
+ * t_tid | t_info | key values | [heap TID]
+ *
+ * We store the number of columns present inside pivot tuples by abusing
+ * their t_tid offset field, since pivot tuples never need to store a real
+ * offset (downlinks only need to store a block number in t_tid). The
+ * offset field only stores the number of columns/attributes when the
+ * INDEX_ALT_TID_MASK bit is set, which doesn't count the trailing heap
+ * TID column sometimes stored in pivot tuples -- that's represented by
+ * the presence of BT_HEAP_TID_ATTR. The INDEX_ALT_TID_MASK bit in t_info
+ * is always set on BTREE_VERSION 4. BT_HEAP_TID_ATTR can only be set on
+ * BTREE_VERSION 4.
+ *
+ * In version 3 indexes, the INDEX_ALT_TID_MASK flag might not be set in
+ * pivot tuples. In that case, the number of key columns is implicitly
+ * the same as the number of key columns in the index. It is not usually
+ * set on version 2 indexes, which predate the introduction of INCLUDE
+ * indexes. (Only explicitly truncated pivot tuples explicitly represent
+ * the number of key columns on versions 2 and 3, whereas all pivot tuples
+ * are formed using truncation on version 4. A version 2 index will have
+ * it set for an internal page negative infinity item iff internal page
+ * split occurred after upgrade to Postgres 11+.)
+ *
+ * The 12 least significant offset bits from t_tid are used to represent
+ * the number of columns in INDEX_ALT_TID_MASK tuples, leaving 4 status
+ * bits (BT_RESERVED_OFFSET_MASK bits), 3 of which that are reserved for
+ * future use. BT_N_KEYS_OFFSET_MASK should be large enough to store any
+ * number of columns/attributes <= INDEX_MAX_KEYS.
+ *
+ * Note well: The macros that deal with the number of attributes in tuples
+ * assume that a tuple with INDEX_ALT_TID_MASK set must be a pivot tuple,
+ * and that a tuple without INDEX_ALT_TID_MASK set must be a non-pivot
+ * tuple (or must have the same number of attributes as the index has
+ * generally in the case of !heapkeyspace indexes). They will need to be
+ * updated if non-pivot tuples ever get taught to use INDEX_ALT_TID_MASK
+ * for something else.
*/
#define INDEX_ALT_TID_MASK INDEX_AM_RESERVED_BIT
+
+/* Item pointer offset bits */
#define BT_RESERVED_OFFSET_MASK 0xF000
#define BT_N_KEYS_OFFSET_MASK 0x0FFF
+#define BT_HEAP_TID_ATTR 0x1000
/* Get/set downlink block number */
#define BTreeInnerTupleGetDownLink(itup) \
@@ -241,14 +315,16 @@ typedef struct BTMetaPageData
} while(0)
/*
- * Get/set number of attributes within B-tree index tuple. Asserts should be
- * removed when BT_RESERVED_OFFSET_MASK bits will be used.
+ * Get/set number of attributes within B-tree index tuple.
+ *
+ * Note that this does not include an implicit tiebreaker heap-TID
+ * attribute, if any. Note also that the number of key attributes must be
+ * explicitly represented in all heapkeyspace pivot tuples.
*/
#define BTreeTupleGetNAtts(itup, rel) \
( \
(itup)->t_info & INDEX_ALT_TID_MASK ? \
( \
- AssertMacro((ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_RESERVED_OFFSET_MASK) == 0), \
ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_N_KEYS_OFFSET_MASK \
) \
: \
@@ -257,11 +333,35 @@ typedef struct BTMetaPageData
#define BTreeTupleSetNAtts(itup, n) \
do { \
(itup)->t_info |= INDEX_ALT_TID_MASK; \
- Assert(((n) & BT_RESERVED_OFFSET_MASK) == 0); \
ItemPointerSetOffsetNumber(&(itup)->t_tid, (n) & BT_N_KEYS_OFFSET_MASK); \
} while(0)
/*
+ * Get tiebreaker heap TID attribute, if any. Macro works with both pivot
+ * and non-pivot tuples, despite differences in how heap TID is represented.
+ */
+#define BTreeTupleGetHeapTID(itup) \
+ ( \
+ (itup)->t_info & INDEX_ALT_TID_MASK && \
+ (ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_HEAP_TID_ATTR) != 0 ? \
+ ( \
+ (ItemPointer) (((char *) (itup) + IndexTupleSize(itup)) - \
+ sizeof(ItemPointerData)) \
+ ) \
+ : (itup)->t_info & INDEX_ALT_TID_MASK ? NULL : (ItemPointer) &((itup)->t_tid) \
+ )
+/*
+ * Set the heap TID attribute for a tuple that uses the INDEX_ALT_TID_MASK
+ * representation (currently limited to pivot tuples)
+ */
+#define BTreeTupleSetAltHeapTID(itup) \
+ do { \
+ Assert((itup)->t_info & INDEX_ALT_TID_MASK); \
+ ItemPointerSetOffsetNumber(&(itup)->t_tid, \
+ ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) | BT_HEAP_TID_ATTR); \
+ } while(0)
+
+/*
* Operator strategy numbers for B-tree have been moved to access/stratnum.h,
* because many places need to use them in ScanKeyInit() calls.
*
@@ -325,20 +425,42 @@ typedef BTStackData *BTStack;
* be confused with a search scankey). It's used to descend a B-Tree using
* _bt_search.
*
+ * heapkeyspace indicates if we expect all keys in the index to be physically
+ * unique because heap TID is used as a tiebreaker attribute, and if index may
+ * have truncated key attributes in pivot tuples. This is actually a property
+ * of the index relation itself (not an indexscan). heapkeyspace indexes are
+ * indexes whose version is >= version 4. It's convenient to keep this close
+ * by, rather than accessing the metapage repeatedly.
+ *
* When nextkey is false (the usual case), _bt_search and _bt_binsrch will
* locate the first item >= scankey. When nextkey is true, they will locate
* the first item > scan key.
*
- * scankeys is an array of scan key entries for attributes that are compared.
- * keysz is the size of the array. During insertion, there must be a scan key
- * for every attribute, but when starting a regular index scan some can be
- * omitted. The array is used as a flexible array member, though it's sized
- * in a way that makes it possible to use stack allocations. See
- * nbtree/README for full details.
+ * pivotsearch is set to true by callers that want to re-find a leaf page
+ * using a scankey built from a leaf page's high key. Most callers set this
+ * to false.
+ *
+ * scantid is the heap TID that is used as a final tiebreaker attribute. It
+ * is set to NULL when index scan doesn't need to find a position for a
+ * specific physical tuple. Must be set when inserting new tuples into
+ * heapkeyspace indexes, since every tuple in the tree unambiguously belongs
+ * in one exact position (it's never set with !heapkeyspace indexes, though).
+ * Despite the representational difference, nbtree search code considers
+ * scantid to be just another insertion scankey attribute.
+ *
+ * scankeys is an array of scan key entries for attributes that are compared
+ * before scantid (user-visible attributes). keysz is the size of the array.
+ * During insertion, there must be a scan key for every attribute, but when
+ * starting a regular index scan some can be omitted. The array is used as a
+ * flexible array member, though it's sized in a way that makes it possible to
+ * use stack allocations. See nbtree/README for full details.
*/
typedef struct BTScanInsertData
{
+ bool heapkeyspace;
bool nextkey;
+ bool pivotsearch;
+ ItemPointer scantid; /* tiebreaker for scankeys */
int keysz; /* Size of scankeys array */
ScanKeyData scankeys[INDEX_MAX_KEYS]; /* Must appear last */
} BTScanInsertData;
@@ -599,6 +721,7 @@ extern void _bt_upgrademetapage(Page page);
extern Buffer _bt_getroot(Relation rel, int access);
extern Buffer _bt_gettrueroot(Relation rel);
extern int _bt_getrootheight(Relation rel);
+extern bool _bt_heapkeyspace(Relation rel);
extern void _bt_checkpage(Relation rel, Buffer buf);
extern Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access);
extern Buffer _bt_relandgetbuf(Relation rel, Buffer obuf,
@@ -652,8 +775,12 @@ extern bytea *btoptions(Datum reloptions, bool validate);
extern bool btproperty(Oid index_oid, int attno,
IndexAMProperty prop, const char *propname,
bool *res, bool *isnull);
-extern IndexTuple _bt_nonkey_truncate(Relation rel, IndexTuple itup);
-extern bool _bt_check_natts(Relation rel, Page page, OffsetNumber offnum);
+extern IndexTuple _bt_truncate(Relation rel, IndexTuple lastleft,
+ IndexTuple firstright, BTScanInsert itup_key);
+extern bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page,
+ OffsetNumber offnum);
+extern void _bt_check_third_page(Relation rel, Relation heap,
+ bool needheaptidspace, Page page, IndexTuple newtup);
/*
* prototypes for functions in nbtvalidate.c
diff --git a/src/include/access/nbtxlog.h b/src/include/access/nbtxlog.h
index a605851c981..6320a0098ff 100644
--- a/src/include/access/nbtxlog.h
+++ b/src/include/access/nbtxlog.h
@@ -28,8 +28,7 @@
#define XLOG_BTREE_INSERT_META 0x20 /* same, plus update metapage */
#define XLOG_BTREE_SPLIT_L 0x30 /* add index tuple with split */
#define XLOG_BTREE_SPLIT_R 0x40 /* as above, new item on right */
-#define XLOG_BTREE_SPLIT_L_HIGHKEY 0x50 /* as above, include truncated highkey */
-#define XLOG_BTREE_SPLIT_R_HIGHKEY 0x60 /* as above, include truncated highkey */
+/* 0x50 and 0x60 are unused */
#define XLOG_BTREE_DELETE 0x70 /* delete leaf index tuples for a page */
#define XLOG_BTREE_UNLINK_PAGE 0x80 /* delete a half-dead page */
#define XLOG_BTREE_UNLINK_PAGE_META 0x90 /* same, and update metapage */
@@ -47,6 +46,7 @@
*/
typedef struct xl_btree_metadata
{
+ uint32 version;
BlockNumber root;
uint32 level;
BlockNumber fastroot;
@@ -80,27 +80,30 @@ typedef struct xl_btree_insert
* whole page image. The left page, however, is handled in the normal
* incremental-update fashion.
*
- * Note: the four XLOG_BTREE_SPLIT xl_info codes all use this data record.
- * The _L and _R variants indicate whether the inserted tuple went into the
- * left or right split page (and thus, whether newitemoff and the new item
- * are stored or not). The _HIGHKEY variants indicate that we've logged
- * explicitly left page high key value, otherwise redo should use right page
- * leftmost key as a left page high key. _HIGHKEY is specified for internal
- * pages where right page leftmost key is suppressed, and for leaf pages
- * of covering indexes where high key have non-key attributes truncated.
+ * Note: XLOG_BTREE_SPLIT_L and XLOG_BTREE_SPLIT_R share this data record.
+ * There are two variants to indicate whether the inserted tuple went into the
+ * left or right split page (and thus, whether newitemoff and the new item are
+ * stored or not). We always log the left page high key because suffix
+ * truncation can generate a new leaf high key using user-defined code. This
+ * is also necessary on internal pages, since the first right item that the
+ * left page's high key was based on will have been truncated to zero
+ * attributes in the right page (the original is unavailable from the right
+ * page).
*
* Backup Blk 0: original page / new left page
*
* The left page's data portion contains the new item, if it's the _L variant.
- * (In the _R variants, the new item is one of the right page's tuples.)
- * If level > 0, an IndexTuple representing the HIKEY of the left page
- * follows. We don't need this on leaf pages, because it's the same as the
- * leftmost key in the new right page.
+ * An IndexTuple representing the high key of the left page must follow with
+ * either variant.
*
* Backup Blk 1: new right page
*
- * The right page's data portion contains the right page's tuples in the
- * form used by _bt_restore_page.
+ * The right page's data portion contains the right page's tuples in the form
+ * used by _bt_restore_page. This includes the new item, if it's the _R
+ * variant. The right page's tuples also include the right page's high key
+ * with either variant (moved from the left/original page during the split),
+ * unless the split happened to be of the rightmost page on its level, where
+ * there is no high key for new right page.
*
* Backup Blk 2: next block (orig page's rightlink), if any
* Backup Blk 3: child's left sibling, if non-leaf split