Diffstat (limited to 'src/backend/access/nbtree/nbtinsert.c')
-rw-r--r--  src/backend/access/nbtree/nbtinsert.c  407
1 file changed, 262 insertions(+), 145 deletions(-)
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index 1facd0535d8..b2ee0adb502 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -61,14 +61,16 @@ static OffsetNumber _bt_findinsertloc(Relation rel,
BTStack stack,
Relation heapRel);
static void _bt_stepright(Relation rel, BTInsertState insertstate, BTStack stack);
-static void _bt_insertonpg(Relation rel, Buffer buf, Buffer cbuf,
+static void _bt_insertonpg(Relation rel, BTScanInsert itup_key,
+ Buffer buf,
+ Buffer cbuf,
BTStack stack,
IndexTuple itup,
OffsetNumber newitemoff,
bool split_only_page);
-static Buffer _bt_split(Relation rel, Buffer buf, Buffer cbuf,
- OffsetNumber firstright, OffsetNumber newitemoff, Size newitemsz,
- IndexTuple newitem, bool newitemonleft);
+static Buffer _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf,
+ Buffer cbuf, OffsetNumber firstright, OffsetNumber newitemoff,
+ Size newitemsz, IndexTuple newitem, bool newitemonleft);
static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf,
BTStack stack, bool is_root, bool is_only);
static OffsetNumber _bt_findsplitloc(Relation rel, Page page,
@@ -116,6 +118,9 @@ _bt_doinsert(Relation rel, IndexTuple itup,
/* we need an insertion scan key to do our search, so build one */
itup_key = _bt_mkscankey(rel, itup);
+ /* No scantid until uniqueness established in checkingunique case */
+ if (checkingunique && itup_key->heapkeyspace)
+ itup_key->scantid = NULL;
/*
* Fill in the BTInsertState working area, to track the current page and
@@ -231,12 +236,13 @@ top:
* NOTE: obviously, _bt_check_unique can only detect keys that are already
* in the index; so it cannot defend against concurrent insertions of the
* same key. We protect against that by means of holding a write lock on
- * the target page. Any other would-be inserter of the same key must
- * acquire a write lock on the same target page, so only one would-be
- * inserter can be making the check at one time. Furthermore, once we are
- * past the check we hold write locks continuously until we have performed
- * our insertion, so no later inserter can fail to see our insertion.
- * (This requires some care in _bt_findinsertloc.)
+ * the first page the value could be on, regardless of the value of its
+ * implicit heap TID tiebreaker attribute. Any other would-be inserter of
+ * the same key must acquire a write lock on the same page, so only one
+ * would-be inserter can be making the check at one time. Furthermore,
+ * once we are past the check we hold write locks continuously until we
+ * have performed our insertion, so no later inserter can fail to see our
+ * insertion. (This requires some care in _bt_findinsertloc.)
*
* If we must wait for another xact, we release the lock while waiting,
* and then must start over completely.
@@ -274,6 +280,10 @@ top:
_bt_freestack(stack);
goto top;
}
+
+ /* Uniqueness is established -- restore heap tid as scantid */
+ if (itup_key->heapkeyspace)
+ itup_key->scantid = &itup->t_tid;
}
if (checkUnique != UNIQUE_CHECK_EXISTING)
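The clear-then-restore dance above is the heart of the change: with scantid left NULL, all duplicates of a key compare as equal, so the descent settles on the first leaf page a duplicate could be on; once _bt_check_unique() is done, the heap TID tiebreaker is restored and the insertion scan key identifies exactly one spot in the keyspace. Below is a minimal standalone sketch of that comparison rule, with made-up InsertKey/IndexTupleModel types rather than the real nbtree structs:

#include <stddef.h>

typedef struct
{
    int         key;        /* user-visible key attribute */
    const int  *scantid;    /* heap TID tiebreaker, or NULL */
} InsertKey;

typedef struct
{
    int         key;
    int         tid;
} IndexTupleModel;

/* Loosely mimics _bt_compare(): a NULL scantid ends the comparison early */
static int
compare_insert_key(const InsertKey *ikey, const IndexTupleModel *tup)
{
    if (ikey->key != tup->key)
        return ikey->key < tup->key ? -1 : 1;
    if (ikey->scantid == NULL)
        return 0;               /* all duplicates look equal */
    if (*ikey->scantid != tup->tid)
        return *ikey->scantid < tup->tid ? -1 : 1;
    return 0;
}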
@@ -282,12 +292,11 @@ top:
/*
* The only conflict predicate locking cares about for indexes is when
- * an index tuple insert conflicts with an existing lock. Since the
- * actual location of the insert is hard to predict because of the
- * random search used to prevent O(N^2) performance when there are
- * many duplicate entries, we can just use the "first valid" page.
- * This reasoning also applies to INCLUDE indexes, whose extra
- * attributes are not considered part of the key space.
+ * an index tuple insert conflicts with an existing lock. We don't
+ * yet know the actual page we will insert on, because scantid was
+ * not filled in initially, but it's okay to use the "first valid"
+ * page instead. This reasoning also applies to INCLUDE indexes,
+ * whose extra attributes are not considered part of the key space.
*/
CheckForSerializableConflictIn(rel, NULL, insertstate.buf);
@@ -298,8 +307,8 @@ top:
*/
newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique,
stack, heapRel);
- _bt_insertonpg(rel, insertstate.buf, InvalidBuffer, stack, itup,
- newitemoff, false);
+ _bt_insertonpg(rel, itup_key, insertstate.buf, InvalidBuffer, stack,
+ itup, newitemoff, false);
}
else
{
@@ -371,6 +380,7 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
* Scan over all equal tuples, looking for live conflicts.
*/
Assert(!insertstate->bounds_valid || insertstate->low == offset);
+ Assert(itup_key->scantid == NULL);
for (;;)
{
ItemId curitemid;
@@ -642,18 +652,21 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
/*
* _bt_findinsertloc() -- Finds an insert location for a tuple
*
- * On entry, insertstate buffer contains the first legal page the new
- * tuple could be inserted to. It is exclusive-locked and pinned by the
- * caller.
+ * On entry, insertstate buffer contains the page the new tuple belongs
+ * on. It is exclusive-locked and pinned by the caller.
+ *
+ * If 'checkingunique' is true, the buffer on entry is the first page
+ * that contains duplicates of the new key. If there are duplicates on
+ * multiple pages, the correct insertion position might be some page to
+ * the right, rather than the first page. In that case, this function
+ * moves right to the correct target page.
*
- * If the new key is equal to one or more existing keys, we can
- * legitimately place it anywhere in the series of equal keys --- in fact,
- * if the new key is equal to the page's "high key" we can place it on
- * the next page. If it is equal to the high key, and there's not room
- * to insert the new tuple on the current page without splitting, then
- * we can move right hoping to find more free space and avoid a split.
- * Furthermore, if there's not enough room on a page, we try to make
- * room by removing any LP_DEAD tuples.
+ * (In a !heapkeyspace index, there can be multiple pages with the same
+ * high key, on which the new tuple could legitimately be placed. In
+ * that case, the caller passes the first page containing duplicates,
+ * just like when checkingunique=true. If that page doesn't have enough
+ * room for the new tuple, this function moves right, trying to find a
+ * legal page that does.)
*
* On exit, insertstate buffer contains the chosen insertion page, and
* the offset within that page is returned. If _bt_findinsertloc needed
@@ -663,6 +676,9 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
* If insertstate contains cached binary search bounds, we will take
* advantage of them. This avoids repeating comparisons that we made in
* _bt_check_unique() already.
+ *
+ * If there is not enough room on the page for the new tuple, we try to
+ * make room by removing any LP_DEAD tuples.
*/
static OffsetNumber
_bt_findinsertloc(Relation rel,
@@ -677,87 +693,144 @@ _bt_findinsertloc(Relation rel,
lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
- /*
- * Check whether the item can fit on a btree page at all. (Eventually, we
- * ought to try to apply TOAST methods if not.) We actually need to be
- * able to fit three items on every page, so restrict any one item to 1/3
- * the per-page available space. Note that at this point, itemsz doesn't
- * include the ItemId.
- *
- * NOTE: if you change this, see also the similar code in _bt_buildadd().
- */
- if (insertstate->itemsz > BTMaxItemSize(page))
- ereport(ERROR,
- (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
- errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
- insertstate->itemsz, BTMaxItemSize(page),
- RelationGetRelationName(rel)),
- errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
- "Consider a function index of an MD5 hash of the value, "
- "or use full text indexing."),
- errtableconstraint(heapRel,
- RelationGetRelationName(rel))));
+ /* Check 1/3 of a page restriction */
+ if (unlikely(insertstate->itemsz > BTMaxItemSize(page)))
+ _bt_check_third_page(rel, heapRel, itup_key->heapkeyspace, page,
+ insertstate->itup);
- /*----------
- * If we will need to split the page to put the item on this page,
- * check whether we can put the tuple somewhere to the right,
- * instead. Keep scanning right until we
- * (a) find a page with enough free space,
- * (b) reach the last page where the tuple can legally go, or
- * (c) get tired of searching.
- * (c) is not flippant; it is important because if there are many
- * pages' worth of equal keys, it's better to split one of the early
- * pages than to scan all the way to the end of the run of equal keys
- * on every insert. We implement "get tired" as a random choice,
- * since stopping after scanning a fixed number of pages wouldn't work
- * well (we'd never reach the right-hand side of previously split
- * pages). Currently the probability of moving right is set at 0.99,
- * which may seem too high to change the behavior much, but it does an
- * excellent job of preventing O(N^2) behavior with many equal keys.
- *----------
- */
Assert(P_ISLEAF(lpageop) && !P_INCOMPLETE_SPLIT(lpageop));
Assert(!insertstate->bounds_valid || checkingunique);
+ Assert(!itup_key->heapkeyspace || itup_key->scantid != NULL);
+ Assert(itup_key->heapkeyspace || itup_key->scantid == NULL);
- while (PageGetFreeSpace(page) < insertstate->itemsz)
+ if (itup_key->heapkeyspace)
{
/*
- * before considering moving right, see if we can obtain enough space
- * by erasing LP_DEAD items
+ * If we're inserting into a unique index, we may have to walk right
+ * through leaf pages to find the one leaf page that we must insert
+ * onto.
+ *
+ * This is needed for checkingunique callers because scantid was not
+ * set when we called _bt_search(). scantid can only be set after
+ * _bt_check_unique() has checked for duplicates. The buffer
+ * initially stored in insertstate->buf has the page where the first
+ * duplicate key might be found, which isn't always the page that the
+ * new tuple belongs on. The heap TID attribute of the new tuple
+ * (scantid) could force us to insert on a sibling page, though that
+ * should be very rare in practice.
*/
- if (P_HAS_GARBAGE(lpageop))
+ if (checkingunique)
{
- _bt_vacuum_one_page(rel, insertstate->buf, heapRel);
- insertstate->bounds_valid = false;
+ for (;;)
+ {
+ /*
+ * Does the new tuple belong on this page?
+ *
+ * The earlier _bt_check_unique() call may well have
+ * established a strict upper bound on the offset for the new
+ * item. If it's not the last item of the page (i.e. if there
+ * is at least one tuple on the page that goes after the tuple
+ * we're inserting) then we know that the tuple belongs on
+ * this page. We can skip the high key check.
+ */
+ if (insertstate->bounds_valid &&
+ insertstate->low <= insertstate->stricthigh &&
+ insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
+ break;
+
+ /* Test '<=', not '!=', since scantid is set now */
+ if (P_RIGHTMOST(lpageop) ||
+ _bt_compare(rel, itup_key, page, P_HIKEY) <= 0)
+ break;
- if (PageGetFreeSpace(page) >= insertstate->itemsz)
- break; /* OK, now we have enough space */
+ _bt_stepright(rel, insertstate, stack);
+ /* Update local state after stepping right */
+ page = BufferGetPage(insertstate->buf);
+ lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
+ }
}
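Under the same toy model, the checkingunique move-right loop above reduces to: advance while the page is not rightmost and its high key is strictly less than the full key, scantid included. A sketch reusing compare_insert_key() and the types from the earlier example, with a flat array of high keys standing in for the leaf level (the real code, of course, locks sibling buffers and steps via _bt_stepright()):

/* Returns the index of the one page the new tuple must go on */
static int
find_insert_page(const InsertKey *ikey, const IndexTupleModel *highkeys,
                 int npages)
{
    int         pageno = 0;

    /* test '<=', not '!=', since scantid is set now */
    while (pageno < npages - 1 &&   /* rightmost page has no high key */
           compare_insert_key(ikey, &highkeys[pageno]) > 0)
        pageno++;                   /* stands in for _bt_stepright() */
    return pageno;
}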
/*
- * Nope, so check conditions (b) and (c) enumerated above
+ * If the target page is full, see if we can obtain enough space by
+ * erasing LP_DEAD items
+ */
+ if (PageGetFreeSpace(page) < insertstate->itemsz &&
+ P_HAS_GARBAGE(lpageop))
+ {
+ _bt_vacuum_one_page(rel, insertstate->buf, heapRel);
+ insertstate->bounds_valid = false;
+ }
+ }
+ else
+ {
+ /*----------
+ * This is a !heapkeyspace (version 2 or 3) index. The current page
+ * is the first page the new tuple could be inserted on, but there
+ * may be other pages to the right that we could opt to use instead.
*
- * The earlier _bt_check_unique() call may well have established a
- * strict upper bound on the offset for the new item. If it's not the
- * last item of the page (i.e. if there is at least one tuple on the
- * page that's greater than the tuple we're inserting to) then we know
- * that the tuple belongs on this page. We can skip the high key
- * check.
+ * If the new key is equal to one or more existing keys, we can
+ * legitimately place it anywhere in the series of equal keys. In
+ * fact, if the new key is equal to the page's "high key" we can place
+ * it on the next page. If it is equal to the high key, and there's
+ * not room to insert the new tuple on the current page without
+ * splitting, then we move right hoping to find more free space and
+ * avoid a split.
+ *
+ * Keep scanning right until we
+ * (a) find a page with enough free space,
+ * (b) reach the last page where the tuple can legally go, or
+ * (c) get tired of searching.
+ * (c) is not flippant; it is important because if there are many
+ * pages' worth of equal keys, it's better to split one of the early
+ * pages than to scan all the way to the end of the run of equal keys
+ * on every insert. We implement "get tired" as a random choice,
+ * since stopping after scanning a fixed number of pages wouldn't work
+ * well (we'd never reach the right-hand side of previously split
+ * pages). The probability of moving right is set at 0.99, which may
+ * seem too high to change the behavior much, but it does an excellent
+ * job of preventing O(N^2) behavior with many equal keys.
+ *----------
*/
- if (insertstate->bounds_valid &&
- insertstate->low <= insertstate->stricthigh &&
- insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
- break;
+ while (PageGetFreeSpace(page) < insertstate->itemsz)
+ {
+ /*
+ * Before considering moving right, see if we can obtain enough
+ * space by erasing LP_DEAD items
+ */
+ if (P_HAS_GARBAGE(lpageop))
+ {
+ _bt_vacuum_one_page(rel, insertstate->buf, heapRel);
+ insertstate->bounds_valid = false;
- if (P_RIGHTMOST(lpageop) ||
- _bt_compare(rel, itup_key, page, P_HIKEY) != 0 ||
- random() <= (MAX_RANDOM_VALUE / 100))
- break;
+ if (PageGetFreeSpace(page) >= insertstate->itemsz)
+ break; /* OK, now we have enough space */
+ }
- _bt_stepright(rel, insertstate, stack);
- /* Update local state after stepping right */
- page = BufferGetPage(insertstate->buf);
- lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
+ /*
+ * Nope, so check conditions (b) and (c) enumerated above
+ *
+ * The earlier _bt_check_unique() call may well have established a
+ * strict upper bound on the offset for the new item. If it's not
+ * the last item of the page (i.e. if there is at least one tuple
+ * on the page that's greater than the tuple we're inserting)
+ * then we know that the tuple belongs on this page. We can skip
+ * the high key check.
+ */
+ if (insertstate->bounds_valid &&
+ insertstate->low <= insertstate->stricthigh &&
+ insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
+ break;
+
+ if (P_RIGHTMOST(lpageop) ||
+ _bt_compare(rel, itup_key, page, P_HIKEY) != 0 ||
+ random() <= (MAX_RANDOM_VALUE / 100))
+ break;
+
+ _bt_stepright(rel, insertstate, stack);
+ /* Update local state after stepping right */
+ page = BufferGetPage(insertstate->buf);
+ lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
+ }
}
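The "get tired" rule retained for !heapkeyspace indexes is worth isolating: each step right continues with probability 0.99, so the chance of walking N pages deep into a run of duplicates decays as 0.99^N, and early pages in the run get split instead. A hedged one-function sketch (MAX_RANDOM_VALUE is assumed here to be 0x7FFFFFFF, matching PostgreSQL's definition):

#include <stdlib.h>

#define MAX_RANDOM_VALUE 0x7FFFFFFF /* assumed value */

/* Model of condition (c): give up moving right ~1% of the time */
static int
get_tired(void)
{
    return random() <= (MAX_RANDOM_VALUE / 100);
}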
/*
@@ -778,6 +851,9 @@ _bt_findinsertloc(Relation rel,
* else someone else's _bt_check_unique scan could fail to see our insertion.
* Write locks on intermediate dead pages won't do because we don't know when
* they will get de-linked from the tree.
+ *
+ * This is more aggressive than it needs to be for non-unique !heapkeyspace
+ * indexes.
*/
static void
_bt_stepright(Relation rel, BTInsertState insertstate, BTStack stack)
@@ -830,8 +906,9 @@ _bt_stepright(Relation rel, BTInsertState insertstate, BTStack stack)
*
* This recursive procedure does the following things:
*
- * + if necessary, splits the target page (making sure that the
- * split is equitable as far as post-insert free space goes).
+ * + if necessary, splits the target page, using 'itup_key' for
+ * suffix truncation on leaf pages (caller passes NULL for
+ * non-leaf pages).
* + inserts the tuple.
* + if the page was split, pops the parent stack, and finds the
* right place to insert the new child pointer (by walking
@@ -857,6 +934,7 @@ _bt_stepright(Relation rel, BTInsertState insertstate, BTStack stack)
*/
static void
_bt_insertonpg(Relation rel,
+ BTScanInsert itup_key,
Buffer buf,
Buffer cbuf,
BTStack stack,
@@ -879,7 +957,7 @@ _bt_insertonpg(Relation rel,
BTreeTupleGetNAtts(itup, rel) ==
IndexRelationGetNumberOfAttributes(rel));
Assert(P_ISLEAF(lpageop) ||
- BTreeTupleGetNAtts(itup, rel) ==
+ BTreeTupleGetNAtts(itup, rel) <=
IndexRelationGetNumberOfKeyAttributes(rel));
/* The caller should've finished any incomplete splits already. */
@@ -929,8 +1007,8 @@ _bt_insertonpg(Relation rel,
&newitemonleft);
/* split the buffer into left and right halves */
- rbuf = _bt_split(rel, buf, cbuf, firstright,
- newitemoff, itemsz, itup, newitemonleft);
+ rbuf = _bt_split(rel, itup_key, buf, cbuf, firstright, newitemoff,
+ itemsz, itup, newitemonleft);
PredicateLockPageSplit(rel,
BufferGetBlockNumber(buf),
BufferGetBlockNumber(rbuf));
@@ -1014,7 +1092,7 @@ _bt_insertonpg(Relation rel,
if (BufferIsValid(metabuf))
{
/* upgrade meta-page if needed */
- if (metad->btm_version < BTREE_VERSION)
+ if (metad->btm_version < BTREE_NOVAC_VERSION)
_bt_upgrademetapage(metapg);
metad->btm_fastroot = itup_blkno;
metad->btm_fastlevel = lpageop->btpo.level;
@@ -1069,6 +1147,8 @@ _bt_insertonpg(Relation rel,
if (BufferIsValid(metabuf))
{
+ Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
+ xlmeta.version = metad->btm_version;
xlmeta.root = metad->btm_root;
xlmeta.level = metad->btm_level;
xlmeta.fastroot = metad->btm_fastroot;
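Both metapage-touching paths follow one pattern after this change: a pre-BTREE_NOVAC_VERSION metapage is upgraded in place only when it must be rewritten anyway, and the version is now carried in the WAL record so redo reproduces it exactly. A compressed sketch with invented stand-ins for the real metapage and WAL structs:

#define BTREE_NOVAC_VERSION_MODEL 3 /* invented stand-in constant */

typedef struct
{
    unsigned    btm_version;
    /* ... remaining metapage fields elided ... */
} MetaModel;

typedef struct
{
    unsigned    version;        /* newly logged alongside root info */
} XlMetaModel;

static void
prepare_meta_record(MetaModel *metad, XlMetaModel *xlmeta)
{
    /* lazy upgrade: only when the metapage is being dirtied anyway */
    if (metad->btm_version < BTREE_NOVAC_VERSION_MODEL)
        metad->btm_version = BTREE_NOVAC_VERSION_MODEL;
    xlmeta->version = metad->btm_version;
}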
@@ -1136,17 +1216,19 @@ _bt_insertonpg(Relation rel,
* new right page. newitemoff etc. tell us about the new item that
* must be inserted along with the data from the old page.
*
- * When splitting a non-leaf page, 'cbuf' is the left-sibling of the
- * page we're inserting the downlink for. This function will clear the
- * INCOMPLETE_SPLIT flag on it, and release the buffer.
+ * itup_key is used for suffix truncation on leaf pages (internal
+ * page callers pass NULL). When splitting a non-leaf page, 'cbuf'
+ * is the left-sibling of the page we're inserting the downlink for.
+ * This function will clear the INCOMPLETE_SPLIT flag on it, and
+ * release the buffer.
*
* Returns the new right sibling of buf, pinned and write-locked.
* The pin and lock on buf are maintained.
*/
static Buffer
-_bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright,
- OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem,
- bool newitemonleft)
+_bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
+ OffsetNumber firstright, OffsetNumber newitemoff, Size newitemsz,
+ IndexTuple newitem, bool newitemonleft)
{
Buffer rbuf;
Page origpage;
@@ -1240,7 +1322,8 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright,
itemid = PageGetItemId(origpage, P_HIKEY);
itemsz = ItemIdGetLength(itemid);
item = (IndexTuple) PageGetItem(origpage, itemid);
- Assert(BTreeTupleGetNAtts(item, rel) == indnkeyatts);
+ Assert(BTreeTupleGetNAtts(item, rel) > 0);
+ Assert(BTreeTupleGetNAtts(item, rel) <= indnkeyatts);
if (PageAddItem(rightpage, (Item) item, itemsz, rightoff,
false, false) == InvalidOffsetNumber)
{
@@ -1254,8 +1337,29 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright,
/*
* The "high key" for the new left page will be the first key that's going
- * to go into the new right page. This might be either the existing data
- * item at position firstright, or the incoming tuple.
+ * to go into the new right page, or possibly a truncated version if this
+ * is a leaf page split. This might be either the existing data item at
+ * position firstright, or the incoming tuple.
+ *
+ * The high key for the left page is formed using the first item on the
+ * right page, which may seem to be contrary to Lehman & Yao's approach of
+ * using the left page's last item as its new high key when splitting on
+ * the leaf level. It isn't, though: suffix truncation will leave the
+ * left page's high key fully equal to the last item on the left page when
+ * two tuples with equal key values (excluding heap TID) enclose the split
+ * point. It isn't actually necessary for a new leaf high key to be equal
+ * to the last item on the left for the L&Y "subtree" invariant to hold.
+ * It's sufficient to make sure that the new leaf high key is strictly
+ * less than the first item on the right leaf page, and greater than or
+ * equal to (not necessarily equal to) the last item on the left leaf
+ * page.
+ *
+ * In other words, when suffix truncation isn't possible, L&Y's exact
+ * approach to leaf splits is taken. (Actually, even that is slightly
+ * inaccurate. A tuple with all the keys from firstright but the heap TID
+ * from lastleft will be used as the new high key, since the last left
+ * tuple could be physically larger despite being opclass-equal in respect
+ * of all attributes prior to the heap TID attribute.)
*/
leftoff = P_HIKEY;
if (!newitemonleft && newitemoff == firstright)
@@ -1273,25 +1377,48 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright,
}
/*
- * Truncate non-key (INCLUDE) attributes of the high key item before
- * inserting it on the left page. This only needs to happen at the leaf
+ * Truncate unneeded key and non-key attributes of the high key item
+ * before inserting it on the left page. This can only happen at the leaf
* level, since in general all pivot tuple values originate from leaf
- * level high keys. This isn't just about avoiding unnecessary work,
- * though; truncating unneeded key attributes (more aggressive suffix
- * truncation) can only be performed at the leaf level anyway. This is
- * because a pivot tuple in a grandparent page must guide a search not
- * only to the correct parent page, but also to the correct leaf page.
+ * level high keys. A pivot tuple in a grandparent page must guide a
+ * search not only to the correct parent page, but also to the correct
+ * leaf page.
*/
- if (indnatts != indnkeyatts && isleaf)
+ if (isleaf && (itup_key->heapkeyspace || indnatts != indnkeyatts))
{
- lefthikey = _bt_nonkey_truncate(rel, item);
+ IndexTuple lastleft;
+
+ /*
+ * Determine which tuple will become the last on the left page. This
+ * is needed to decide how many attributes from the first item on the
+ * right page must remain in the new high key for the left page.
+ */
+ if (newitemonleft && newitemoff == firstright)
+ {
+ /* incoming tuple will become last on left page */
+ lastleft = newitem;
+ }
+ else
+ {
+ OffsetNumber lastleftoff;
+
+ /* item just before firstright will become last on left page */
+ lastleftoff = OffsetNumberPrev(firstright);
+ Assert(lastleftoff >= P_FIRSTDATAKEY(oopaque));
+ itemid = PageGetItemId(origpage, lastleftoff);
+ lastleft = (IndexTuple) PageGetItem(origpage, itemid);
+ }
+
+ Assert(lastleft != item);
+ lefthikey = _bt_truncate(rel, lastleft, item, itup_key);
itemsz = IndexTupleSize(lefthikey);
itemsz = MAXALIGN(itemsz);
}
else
lefthikey = item;
- Assert(BTreeTupleGetNAtts(lefthikey, rel) == indnkeyatts);
+ Assert(BTreeTupleGetNAtts(lefthikey, rel) > 0);
+ Assert(BTreeTupleGetNAtts(lefthikey, rel) <= indnkeyatts);
if (PageAddItem(leftpage, (Item) lefthikey, itemsz, leftoff,
false, false) == InvalidOffsetNumber)
{
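How much _bt_truncate() actually keeps is decided by the lastleft/firstright pair chosen above: enough leading attributes of firstright to be strictly greater than lastleft, with lastleft's heap TID kept as a tiebreaker when every key attribute ties. A toy version of that decision over plain integer attribute arrays (hypothetical representation; the real function also handles INCLUDE columns and constructs the pivot tuple itself):

/* How many leading attributes must the new left high key keep? */
static int
keep_natts_model(const int *lastleft, const int *firstright, int natts)
{
    int         keep;

    for (keep = 1; keep <= natts; keep++)
    {
        if (lastleft[keep - 1] != firstright[keep - 1])
            return keep;        /* first distinguishing attribute */
    }
    /* all key attributes are equal: a heap TID tiebreaker is required */
    return natts;
}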
@@ -1484,7 +1611,6 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright,
xl_btree_split xlrec;
uint8 xlinfo;
XLogRecPtr recptr;
- bool loglhikey = false;
xlrec.level = ropaque->btpo.level;
xlrec.firstright = firstright;
@@ -1513,22 +1639,10 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright,
if (newitemonleft)
XLogRegisterBufData(0, (char *) newitem, MAXALIGN(newitemsz));
- /* Log left page */
- if (!isleaf || indnatts != indnkeyatts)
- {
- /*
- * We must also log the left page's high key. There are two
- * reasons for that: right page's leftmost key is suppressed on
- * non-leaf levels and in covering indexes included columns are
- * truncated from high keys. Show it as belonging to the left
- * page buffer, so that it is not stored if XLogInsert decides it
- * needs a full-page image of the left page.
- */
- itemid = PageGetItemId(origpage, P_HIKEY);
- item = (IndexTuple) PageGetItem(origpage, itemid);
- XLogRegisterBufData(0, (char *) item, MAXALIGN(IndexTupleSize(item)));
- loglhikey = true;
- }
+ /* Log the left page's new high key */
+ itemid = PageGetItemId(origpage, P_HIKEY);
+ item = (IndexTuple) PageGetItem(origpage, itemid);
+ XLogRegisterBufData(0, (char *) item, MAXALIGN(IndexTupleSize(item)));
/*
* Log the contents of the right page in the format understood by
@@ -1544,9 +1658,7 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright,
(char *) rightpage + ((PageHeader) rightpage)->pd_upper,
((PageHeader) rightpage)->pd_special - ((PageHeader) rightpage)->pd_upper);
- xlinfo = newitemonleft ?
- (loglhikey ? XLOG_BTREE_SPLIT_L_HIGHKEY : XLOG_BTREE_SPLIT_L) :
- (loglhikey ? XLOG_BTREE_SPLIT_R_HIGHKEY : XLOG_BTREE_SPLIT_R);
+ xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R;
recptr = XLogInsert(RM_BTREE_ID, xlinfo);
PageSetLSN(origpage, recptr);
@@ -1909,7 +2021,7 @@ _bt_insert_parent(Relation rel,
_bt_relbuf(rel, pbuf);
}
- /* get high key from left page == lower bound for new right page */
+ /* get high key from left, a strict lower bound for new right page */
ritem = (IndexTuple) PageGetItem(page,
PageGetItemId(page, P_HIKEY));
@@ -1939,7 +2051,7 @@ _bt_insert_parent(Relation rel,
RelationGetRelationName(rel), bknum, rbknum);
/* Recursively update the parent */
- _bt_insertonpg(rel, pbuf, buf, stack->bts_parent,
+ _bt_insertonpg(rel, NULL, pbuf, buf, stack->bts_parent,
new_item, stack->bts_offset + 1,
is_only);
@@ -2200,7 +2312,7 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
START_CRIT_SECTION();
/* upgrade metapage if needed */
- if (metad->btm_version < BTREE_VERSION)
+ if (metad->btm_version < BTREE_NOVAC_VERSION)
_bt_upgrademetapage(metapg);
/* set btree special data */
@@ -2235,7 +2347,8 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
/*
* insert the right page pointer into the new root page.
*/
- Assert(BTreeTupleGetNAtts(right_item, rel) ==
+ Assert(BTreeTupleGetNAtts(right_item, rel) > 0);
+ Assert(BTreeTupleGetNAtts(right_item, rel) <=
IndexRelationGetNumberOfKeyAttributes(rel));
if (PageAddItem(rootpage, (Item) right_item, right_item_sz, P_FIRSTKEY,
false, false) == InvalidOffsetNumber)
@@ -2268,6 +2381,8 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD);
XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
+ Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
+ md.version = metad->btm_version;
md.root = rootblknum;
md.level = metad->btm_level;
md.fastroot = rootblknum;
@@ -2332,6 +2447,7 @@ _bt_pgaddtup(Page page,
{
trunctuple = *itup;
trunctuple.t_info = sizeof(IndexTupleData);
+ /* Deliberately zero INDEX_ALT_TID_MASK bits */
BTreeTupleSetNAtts(&trunctuple, 0);
itup = &trunctuple;
itemsize = sizeof(IndexTupleData);
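The zero-attribute trick works by overloading the pivot tuple's item pointer: a flag bit in t_info records that the TID offset field holds an attribute count instead of a heap position. A bit-level sketch with invented mask values (the real INDEX_ALT_TID_MASK and BT_N_KEYS_OFFSET_MASK definitions live in the nbtree headers):

#include <stdint.h>

#define ALT_TID_MASK_MODEL       0x2000 /* invented: "offset holds natts" */
#define N_KEYS_OFFSET_MASK_MODEL 0x0fff /* invented: natts bit field */

typedef struct
{
    uint16_t    t_info;         /* flag + size bits, as in IndexTupleData */
    uint16_t    tid_offset;     /* normally a heap TID offset number */
} PivotModel;

static void
set_natts_model(PivotModel *tup, uint16_t natts)
{
    tup->t_info |= ALT_TID_MASK_MODEL;
    /* masking deliberately leaves the higher status bits zeroed */
    tup->tid_offset = natts & N_KEYS_OFFSET_MASK_MODEL;
}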
@@ -2347,8 +2463,8 @@ _bt_pgaddtup(Page page,
/*
* _bt_isequal - used in _bt_doinsert in check for duplicates.
*
- * This is very similar to _bt_compare, except for NULL handling.
- * Rule is simple: NOT_NULL not equal NULL, NULL not equal NULL too.
+ * This is very similar to _bt_compare, except for NULL and negative
+ * infinity handling: NOT_NULL never equals NULL, and NULL never equals NULL.
*/
static bool
_bt_isequal(TupleDesc itupdesc, BTScanInsert itup_key, Page page,
@@ -2361,6 +2477,7 @@ _bt_isequal(TupleDesc itupdesc, BTScanInsert itup_key, Page page,
/* Better be comparing to a non-pivot item */
Assert(P_ISLEAF((BTPageOpaque) PageGetSpecialPointer(page)));
Assert(offnum >= P_FIRSTDATAKEY((BTPageOpaque) PageGetSpecialPointer(page)));
+ Assert(itup_key->scantid == NULL);
scankey = itup_key->scankeys;
itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
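The NULL rule described above is ordinary SQL comparison semantics, and it is the reason a unique index admits any number of NULLs in the indexed column. A closing sketch with a hypothetical DatumModel type:

#include <stdbool.h>

typedef struct
{
    bool        isnull;
    int         value;
} DatumModel;

/* Model of _bt_isequal()'s rule: NULL equals nothing, not even NULL */
static bool
isequal_model(DatumModel a, DatumModel b)
{
    if (a.isnull || b.isnull)
        return false;
    return a.value == b.value;
}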