aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/backend/access/nbtree/nbtinsert.c343
-rw-r--r--src/backend/access/nbtree/nbtsort.c42
-rw-r--r--src/backend/access/nbtree/nbtsplitloc.c202
-rw-r--r--src/backend/access/nbtree/nbtutils.c11
-rw-r--r--src/backend/access/nbtree/nbtxlog.c13
-rw-r--r--src/backend/access/rmgrdesc/nbtdesc.c4
-rw-r--r--src/include/access/nbtxlog.h8
7 files changed, 333 insertions, 290 deletions
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index cbd4ec3b39e..fd518d3ea5a 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -56,8 +56,8 @@ static Buffer _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf,
static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf,
BTStack stack, bool is_root, bool is_only);
static Buffer _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf);
-static bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup,
- OffsetNumber itup_off);
+static inline bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup,
+ OffsetNumber itup_off, bool newfirstdataitem);
static void _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel);
/*
@@ -1452,18 +1452,18 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
BTPageOpaque sopaque = NULL;
Size itemsz;
ItemId itemid;
- IndexTuple item;
- OffsetNumber leftoff,
- rightoff;
- OffsetNumber firstright;
+ IndexTuple firstright,
+ lefthighkey;
+ OffsetNumber firstrightoff;
+ OffsetNumber afterleftoff,
+ afterrightoff,
+ minusinfoff;
OffsetNumber origpagepostingoff;
OffsetNumber maxoff;
OffsetNumber i;
bool newitemonleft,
- isleaf;
- IndexTuple lefthikey;
- int indnatts = IndexRelationGetNumberOfAttributes(rel);
- int indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
+ isleaf,
+ isrightmost;
/*
* origpage is the original page to be split. leftpage is a temporary
@@ -1480,25 +1480,37 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
*/
origpage = BufferGetPage(buf);
oopaque = (BTPageOpaque) PageGetSpecialPointer(origpage);
+ isleaf = P_ISLEAF(oopaque);
+ isrightmost = P_RIGHTMOST(oopaque);
+ maxoff = PageGetMaxOffsetNumber(origpage);
origpagenumber = BufferGetBlockNumber(buf);
/*
* Choose a point to split origpage at.
*
- * A split point can be thought of as a point _between_ two existing
- * tuples on origpage (lastleft and firstright tuples), provided you
+ * A split point can be thought of as a point _between_ two existing data
+ * items on origpage (the lastleft and firstright tuples), provided you
* pretend that the new item that didn't fit is already on origpage.
*
* Since origpage does not actually contain newitem, the representation of
* split points needs to work with two boundary cases: splits where
* newitem is lastleft, and splits where newitem is firstright.
* newitemonleft resolves the ambiguity that would otherwise exist when
- * newitemoff == firstright. In all other cases it's clear which side of
- * the split every tuple goes on from context. newitemonleft is usually
- * (but not always) redundant information.
+ * newitemoff == firstrightoff. In all other cases it's clear which side
+ * of the split every tuple goes on from context. newitemonleft is
+ * usually (but not always) redundant information.
+ *
+ * firstrightoff is supposed to be an origpage offset number, but it's
+ * possible that its value will be maxoff+1, which is "past the end" of
+ * origpage. This happens in the rare case where newitem goes after all
+ * existing items (i.e. newitemoff is maxoff+1) and we end up splitting
+ * origpage at the point that leaves newitem alone on new right page. Any
+ * "!newitemonleft && newitemoff == firstrightoff" split point makes
+ * newitem the firstright tuple, though, so this case isn't a special
+ * case.
*/
- firstright = _bt_findsplitloc(rel, origpage, newitemoff, newitemsz,
- newitem, &newitemonleft);
+ firstrightoff = _bt_findsplitloc(rel, origpage, newitemoff, newitemsz,
+ newitem, &newitemonleft);
/* Allocate temp buffer for leftpage */
leftpage = PageGetTempPage(origpage);
@@ -1524,7 +1536,6 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
* examine the LSN and possibly dump it in a page image.
*/
PageSetLSN(leftpage, PageGetLSN(origpage));
- isleaf = P_ISLEAF(oopaque);
/*
* Determine page offset number of existing overlapped-with-orignewitem
@@ -1555,74 +1566,57 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
}
/*
- * The "high key" for the new left page will be the first key that's going
- * to go into the new right page, or a truncated version if this is a leaf
- * page split.
- *
- * The high key for the left page is formed using the first item on the
- * right page, which may seem to be contrary to Lehman & Yao's approach of
- * using the left page's last item as its new high key when splitting on
- * the leaf level. It isn't, though: suffix truncation will leave the
- * left page's high key fully equal to the last item on the left page when
- * two tuples with equal key values (excluding heap TID) enclose the split
- * point. It isn't actually necessary for a new leaf high key to be equal
- * to the last item on the left for the L&Y "subtree" invariant to hold.
- * It's sufficient to make sure that the new leaf high key is strictly
- * less than the first item on the right leaf page, and greater than or
- * equal to (not necessarily equal to) the last item on the left leaf
- * page.
+ * The high key for the new left page is a possibly-truncated copy of
+ * firstright on the leaf level (it's "firstright itself" on internal
+ * pages; see !isleaf comments below). This may seem to be contrary to
+ * Lehman & Yao's approach of using a copy of lastleft as the new high key
+ * when splitting on the leaf level. It isn't, though.
*
- * In other words, when suffix truncation isn't possible, L&Y's exact
- * approach to leaf splits is taken. (Actually, even that is slightly
- * inaccurate. A tuple with all the keys from firstright but the heap TID
- * from lastleft will be used as the new high key, since the last left
- * tuple could be physically larger despite being opclass-equal in respect
- * of all attributes prior to the heap TID attribute.)
+ * Suffix truncation will leave the left page's high key fully equal to
+ * lastleft when lastleft and firstright are equal prior to heap TID (that
+ * is, the tiebreaker TID value comes from lastleft). It isn't actually
+ * necessary for a new leaf high key to be a copy of lastleft for the L&Y
+ * "subtree" invariant to hold. It's sufficient to make sure that the new
+ * leaf high key is strictly less than firstright, and greater than or
+ * equal to (not necessarily equal to) lastleft. In other words, when
+ * suffix truncation isn't possible during a leaf page split, we take
+ * L&Y's exact approach to generating a new high key for the left page.
+ * (Actually, that is slightly inaccurate. We don't just use a copy of
+ * lastleft. A tuple with all the keys from firstright but the max heap
+ * TID from lastleft is used, to avoid introducing a special case.)
*/
- if (!newitemonleft && newitemoff == firstright)
+ if (!newitemonleft && newitemoff == firstrightoff)
{
- /* incoming tuple will become first on right page */
+ /* incoming tuple becomes firstright */
itemsz = newitemsz;
- item = newitem;
+ firstright = newitem;
}
else
{
- /* existing item at firstright will become first on right page */
- itemid = PageGetItemId(origpage, firstright);
+ /* existing item at firstrightoff becomes firstright */
+ itemid = PageGetItemId(origpage, firstrightoff);
itemsz = ItemIdGetLength(itemid);
- item = (IndexTuple) PageGetItem(origpage, itemid);
- if (firstright == origpagepostingoff)
- item = nposting;
+ firstright = (IndexTuple) PageGetItem(origpage, itemid);
+ if (firstrightoff == origpagepostingoff)
+ firstright = nposting;
}
- /*
- * Truncate unneeded key and non-key attributes of the high key item
- * before inserting it on the left page. This can only happen at the leaf
- * level, since in general all pivot tuple values originate from leaf
- * level high keys. A pivot tuple in a grandparent page must guide a
- * search not only to the correct parent page, but also to the correct
- * leaf page.
- */
- if (isleaf && (itup_key->heapkeyspace || indnatts != indnkeyatts))
+ if (isleaf)
{
IndexTuple lastleft;
- /*
- * Determine which tuple will become the last on the left page. This
- * is needed to decide how many attributes from the first item on the
- * right page must remain in new high key for left page.
- */
- if (newitemonleft && newitemoff == firstright)
+ /* Attempt suffix truncation for leaf page splits */
+ if (newitemonleft && newitemoff == firstrightoff)
{
- /* incoming tuple will become last on left page */
+ /* incoming tuple becomes lastleft */
lastleft = newitem;
}
else
{
OffsetNumber lastleftoff;
- /* item just before firstright will become last on left page */
- lastleftoff = OffsetNumberPrev(firstright);
+ /* existing item before firstrightoff becomes lastleft */
+ lastleftoff = OffsetNumberPrev(firstrightoff);
Assert(lastleftoff >= P_FIRSTDATAKEY(oopaque));
itemid = PageGetItemId(origpage, lastleftoff);
lastleft = (IndexTuple) PageGetItem(origpage, itemid);
@@ -1630,30 +1624,55 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
lastleft = nposting;
}
- Assert(lastleft != item);
- lefthikey = _bt_truncate(rel, lastleft, item, itup_key);
- itemsz = IndexTupleSize(lefthikey);
- itemsz = MAXALIGN(itemsz);
+ lefthighkey = _bt_truncate(rel, lastleft, firstright, itup_key);
+ itemsz = IndexTupleSize(lefthighkey);
}
else
- lefthikey = item;
+ {
+ /*
+ * Don't perform suffix truncation on a copy of firstright to make
+ * left page high key for internal page splits. Must use firstright
+ * as new high key directly.
+ *
+ * Each distinct separator key value originates as a leaf level high
+ * key; all other separator keys/pivot tuples are copied from one
+ * level down. A separator key in a grandparent page must be
+ * identical to high key in rightmost parent page of the subtree to
+ * its left, which must itself be identical to high key in rightmost
+ * child page of that same subtree (this even applies to separator
+ * from grandparent's high key). There must always be an unbroken
+ * "seam" of identical separator keys that guide index scans at every
+ * level, starting from the grandparent. That's why suffix truncation
+ * is unsafe here.
+ *
+ * Internal page splits will truncate firstright into a "negative
+ * infinity" data item when it gets inserted on the new right page
+ * below, though. This happens during the call to _bt_pgaddtup() for
+ * the new first data item for right page. Do not confuse this
+ * mechanism with suffix truncation. It is just a convenient way of
+ * implementing page splits that split the internal page "inside"
+ * firstright. The lefthighkey separator key cannot appear a second
+ * time in the right page (only firstright's downlink goes in right
+ * page).
+ */
+ lefthighkey = firstright;
+ }
/*
* Add new high key to leftpage
*/
- leftoff = P_HIKEY;
+ afterleftoff = P_HIKEY;
- Assert(BTreeTupleGetNAtts(lefthikey, rel) > 0);
- Assert(BTreeTupleGetNAtts(lefthikey, rel) <= indnkeyatts);
- if (PageAddItem(leftpage, (Item) lefthikey, itemsz, leftoff,
- false, false) == InvalidOffsetNumber)
- elog(ERROR, "failed to add hikey to the left sibling"
+ Assert(BTreeTupleGetNAtts(lefthighkey, rel) > 0);
+ Assert(BTreeTupleGetNAtts(lefthighkey, rel) <=
+ IndexRelationGetNumberOfKeyAttributes(rel));
+ Assert(itemsz == MAXALIGN(IndexTupleSize(lefthighkey)));
+ if (PageAddItem(leftpage, (Item) lefthighkey, itemsz, afterleftoff, false,
+ false) == InvalidOffsetNumber)
+ elog(ERROR, "failed to add high key to the left sibling"
" while splitting block %u of index \"%s\"",
origpagenumber, RelationGetRelationName(rel));
- leftoff = OffsetNumberNext(leftoff);
- /* be tidy */
- if (lefthikey != item)
- pfree(lefthikey);
+ afterleftoff = OffsetNumberNext(afterleftoff);
/*
* Acquire a new right page to split into, now that left page has a new
@@ -1700,47 +1719,58 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
* the tree, then the first entry on the page is the high key from
* origpage.
*/
- rightoff = P_HIKEY;
+ afterrightoff = P_HIKEY;
- if (!P_RIGHTMOST(oopaque))
+ if (!isrightmost)
{
+ IndexTuple righthighkey;
+
itemid = PageGetItemId(origpage, P_HIKEY);
itemsz = ItemIdGetLength(itemid);
- item = (IndexTuple) PageGetItem(origpage, itemid);
- Assert(BTreeTupleGetNAtts(item, rel) > 0);
- Assert(BTreeTupleGetNAtts(item, rel) <= indnkeyatts);
- if (PageAddItem(rightpage, (Item) item, itemsz, rightoff,
+ righthighkey = (IndexTuple) PageGetItem(origpage, itemid);
+ Assert(BTreeTupleGetNAtts(righthighkey, rel) > 0);
+ Assert(BTreeTupleGetNAtts(righthighkey, rel) <=
+ IndexRelationGetNumberOfKeyAttributes(rel));
+ if (PageAddItem(rightpage, (Item) righthighkey, itemsz, afterrightoff,
false, false) == InvalidOffsetNumber)
{
memset(rightpage, 0, BufferGetPageSize(rbuf));
- elog(ERROR, "failed to add hikey to the right sibling"
+ elog(ERROR, "failed to add high key to the right sibling"
" while splitting block %u of index \"%s\"",
origpagenumber, RelationGetRelationName(rel));
}
- rightoff = OffsetNumberNext(rightoff);
+ afterrightoff = OffsetNumberNext(afterrightoff);
}
/*
+ * Internal page splits truncate first data item on right page -- it
+ * becomes "minus infinity" item for the page. Set this up here.
+ */
+ minusinfoff = InvalidOffsetNumber;
+ if (!isleaf)
+ minusinfoff = afterrightoff;
+
+ /*
* Now transfer all the data items (non-pivot tuples in isleaf case, or
* additional pivot tuples in !isleaf case) to the appropriate page.
*
* Note: we *must* insert at least the right page's items in item-number
* order, for the benefit of _bt_restore_page().
*/
- maxoff = PageGetMaxOffsetNumber(origpage);
-
for (i = P_FIRSTDATAKEY(oopaque); i <= maxoff; i = OffsetNumberNext(i))
{
+ IndexTuple dataitem;
+
itemid = PageGetItemId(origpage, i);
itemsz = ItemIdGetLength(itemid);
- item = (IndexTuple) PageGetItem(origpage, itemid);
+ dataitem = (IndexTuple) PageGetItem(origpage, itemid);
/* replace original item with nposting due to posting split? */
if (i == origpagepostingoff)
{
- Assert(BTreeTupleIsPosting(item));
+ Assert(BTreeTupleIsPosting(dataitem));
Assert(itemsz == MAXALIGN(IndexTupleSize(nposting)));
- item = nposting;
+ dataitem = nposting;
}
/* does new item belong before this one? */
@@ -1748,56 +1778,59 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
{
if (newitemonleft)
{
- Assert(newitemoff <= firstright);
- if (!_bt_pgaddtup(leftpage, newitemsz, newitem, leftoff))
+ Assert(newitemoff <= firstrightoff);
+ if (!_bt_pgaddtup(leftpage, newitemsz, newitem, afterleftoff,
+ false))
{
memset(rightpage, 0, BufferGetPageSize(rbuf));
elog(ERROR, "failed to add new item to the left sibling"
" while splitting block %u of index \"%s\"",
origpagenumber, RelationGetRelationName(rel));
}
- leftoff = OffsetNumberNext(leftoff);
+ afterleftoff = OffsetNumberNext(afterleftoff);
}
else
{
- Assert(newitemoff >= firstright);
- if (!_bt_pgaddtup(rightpage, newitemsz, newitem, rightoff))
+ Assert(newitemoff >= firstrightoff);
+ if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff,
+ afterrightoff == minusinfoff))
{
memset(rightpage, 0, BufferGetPageSize(rbuf));
elog(ERROR, "failed to add new item to the right sibling"
" while splitting block %u of index \"%s\"",
origpagenumber, RelationGetRelationName(rel));
}
- rightoff = OffsetNumberNext(rightoff);
+ afterrightoff = OffsetNumberNext(afterrightoff);
}
}
/* decide which page to put it on */
- if (i < firstright)
+ if (i < firstrightoff)
{
- if (!_bt_pgaddtup(leftpage, itemsz, item, leftoff))
+ if (!_bt_pgaddtup(leftpage, itemsz, dataitem, afterleftoff, false))
{
memset(rightpage, 0, BufferGetPageSize(rbuf));
elog(ERROR, "failed to add old item to the left sibling"
" while splitting block %u of index \"%s\"",
origpagenumber, RelationGetRelationName(rel));
}
- leftoff = OffsetNumberNext(leftoff);
+ afterleftoff = OffsetNumberNext(afterleftoff);
}
else
{
- if (!_bt_pgaddtup(rightpage, itemsz, item, rightoff))
+ if (!_bt_pgaddtup(rightpage, itemsz, dataitem, afterrightoff,
+ afterrightoff == minusinfoff))
{
memset(rightpage, 0, BufferGetPageSize(rbuf));
elog(ERROR, "failed to add old item to the right sibling"
" while splitting block %u of index \"%s\"",
origpagenumber, RelationGetRelationName(rel));
}
- rightoff = OffsetNumberNext(rightoff);
+ afterrightoff = OffsetNumberNext(afterrightoff);
}
}
- /* cope with possibility that newitem goes at the end */
+ /* Handle case where newitem goes at the end of rightpage */
if (i <= newitemoff)
{
/*
@@ -1805,15 +1838,16 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
* *everything* on the left page, which cannot fit (if it could, we'd
* not be splitting the page).
*/
- Assert(!newitemonleft);
- if (!_bt_pgaddtup(rightpage, newitemsz, newitem, rightoff))
+ Assert(!newitemonleft && newitemoff == maxoff + 1);
+ if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff,
+ afterrightoff == minusinfoff))
{
memset(rightpage, 0, BufferGetPageSize(rbuf));
elog(ERROR, "failed to add new item to the right sibling"
" while splitting block %u of index \"%s\"",
origpagenumber, RelationGetRelationName(rel));
}
- rightoff = OffsetNumberNext(rightoff);
+ afterrightoff = OffsetNumberNext(afterrightoff);
}
/*
@@ -1823,7 +1857,7 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
* all readers release locks on a page before trying to fetch its
* neighbors.
*/
- if (!P_RIGHTMOST(oopaque))
+ if (!isrightmost)
{
sbuf = _bt_getbuf(rel, oopaque->btpo_next, BT_WRITE);
spage = BufferGetPage(sbuf);
@@ -1886,7 +1920,7 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
MarkBufferDirty(buf);
MarkBufferDirty(rbuf);
- if (!P_RIGHTMOST(ropaque))
+ if (!isrightmost)
{
sopaque->btpo_prev = rightpagenumber;
MarkBufferDirty(sbuf);
@@ -1914,10 +1948,10 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
xlrec.level = ropaque->btpo.level;
/* See comments below on newitem, orignewitem, and posting lists */
- xlrec.firstright = firstright;
+ xlrec.firstrightoff = firstrightoff;
xlrec.newitemoff = newitemoff;
xlrec.postingoff = 0;
- if (postingoff != 0 && origpagepostingoff < firstright)
+ if (postingoff != 0 && origpagepostingoff < firstrightoff)
xlrec.postingoff = postingoff;
XLogBeginInsert();
@@ -1925,10 +1959,10 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
XLogRegisterBuffer(1, rbuf, REGBUF_WILL_INIT);
- /* Log the right sibling, because we've changed its prev-pointer. */
- if (!P_RIGHTMOST(ropaque))
+ /* Log original right sibling, since we've changed its prev-pointer */
+ if (!isrightmost)
XLogRegisterBuffer(2, sbuf, REGBUF_STANDARD);
- if (BufferIsValid(cbuf))
+ if (!isleaf)
XLogRegisterBuffer(3, cbuf, REGBUF_STANDARD);
/*
@@ -1959,18 +1993,24 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
* newitem-logged case).
*/
if (newitemonleft && xlrec.postingoff == 0)
- XLogRegisterBufData(0, (char *) newitem, MAXALIGN(newitemsz));
+ XLogRegisterBufData(0, (char *) newitem, newitemsz);
else if (xlrec.postingoff != 0)
{
- Assert(newitemonleft || firstright == newitemoff);
- Assert(MAXALIGN(newitemsz) == IndexTupleSize(orignewitem));
- XLogRegisterBufData(0, (char *) orignewitem, MAXALIGN(newitemsz));
+ Assert(isleaf);
+ Assert(newitemonleft || firstrightoff == newitemoff);
+ Assert(newitemsz == IndexTupleSize(orignewitem));
+ XLogRegisterBufData(0, (char *) orignewitem, newitemsz);
}
/* Log the left page's new high key */
- itemid = PageGetItemId(origpage, P_HIKEY);
- item = (IndexTuple) PageGetItem(origpage, itemid);
- XLogRegisterBufData(0, (char *) item, MAXALIGN(IndexTupleSize(item)));
+ if (!isleaf)
+ {
+ /* lefthighkey isn't local copy, get current pointer */
+ itemid = PageGetItemId(origpage, P_HIKEY);
+ lefthighkey = (IndexTuple) PageGetItem(origpage, itemid);
+ }
+ XLogRegisterBufData(0, (char *) lefthighkey,
+ MAXALIGN(IndexTupleSize(lefthighkey)));
/*
* Log the contents of the right page in the format understood by
@@ -1991,26 +2031,26 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
PageSetLSN(origpage, recptr);
PageSetLSN(rightpage, recptr);
- if (!P_RIGHTMOST(ropaque))
- {
+ if (!isrightmost)
PageSetLSN(spage, recptr);
- }
if (!isleaf)
- {
PageSetLSN(BufferGetPage(cbuf), recptr);
- }
}
END_CRIT_SECTION();
/* release the old right sibling */
- if (!P_RIGHTMOST(ropaque))
+ if (!isrightmost)
_bt_relbuf(rel, sbuf);
/* release the child */
if (!isleaf)
_bt_relbuf(rel, cbuf);
+ /* be tidy */
+ if (isleaf)
+ pfree(lefthighkey);
+
/* split's done */
return rbuf;
}
@@ -2405,9 +2445,9 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
metad = BTPageGetMeta(metapg);
/*
- * Create downlink item for left page (old root). Since this will be the
- * first item in a non-leaf page, it implicitly has minus-infinity key
- * value, so we need not store any actual key in it.
+ * Create downlink item for left page (old root). The key value used is
+ * "minus infinity", a sentinel value that's reliably less than any real
+ * key value that could appear in the left page.
*/
left_item_sz = sizeof(IndexTupleData);
left_item = (IndexTuple) palloc(left_item_sz);
@@ -2541,33 +2581,30 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
* _bt_pgaddtup() -- add a data item to a particular page during split.
*
* The difference between this routine and a bare PageAddItem call is
- * that this code knows that the leftmost data item on an internal
- * btree page has a key that must be treated as minus infinity.
- * Therefore, it truncates away all attributes. This extra step is
- * only needed during internal page splits.
- *
- * Truncation of an internal page data item can be thought of as one
- * of the steps used to "move" a boundary separator key during an
- * internal page split. Conceptually, _bt_split() caller splits
- * internal pages "inside" the firstright data item: firstright's
- * separator key is used as the high key for the left page, while its
- * downlink is used within the first data item (also the negative
- * infinity item) for the right page. Each distinct separator key
- * should appear no more than once per level of the tree.
+ * that this code can deal with the first data item on an internal btree
+ * page in passing. This data item (which is called "firstright" within
+ * _bt_split()) has a key that must be treated as minus infinity after
+ * the split. Therefore, we truncate away all attributes when caller
+ * specifies it's the first data item on page (downlink is not changed,
+ * though). This extra step is only needed for the right page of an
+ * internal page split. There is no need to do this for the first data
+ * item on the existing/left page, since that will already have been
+ * truncated during an earlier page split.
*
- * CAUTION: this works ONLY if we insert the tuples in order, so that
- * the given itup_off does represent the final position of the tuple!
+ * See _bt_split() for a high level explanation of why we truncate here.
+ * Note that this routine has nothing to do with suffix truncation,
+ * despite using some of the same infrastructure.
*/
-static bool
+static inline bool
_bt_pgaddtup(Page page,
Size itemsize,
IndexTuple itup,
- OffsetNumber itup_off)
+ OffsetNumber itup_off,
+ bool newfirstdataitem)
{
- BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
IndexTupleData trunctuple;
- if (!P_ISLEAF(opaque) && itup_off == P_FIRSTDATAKEY(opaque))
+ if (newfirstdataitem)
{
trunctuple = *itup;
trunctuple.t_info = sizeof(IndexTupleData);
@@ -2576,8 +2613,8 @@ _bt_pgaddtup(Page page,
itemsize = sizeof(IndexTupleData);
}
- if (PageAddItem(page, (Item) itup, itemsize, itup_off,
- false, false) == InvalidOffsetNumber)
+ if (unlikely(PageAddItem(page, (Item) itup, itemsize, itup_off, false,
+ false) == InvalidOffsetNumber))
return false;
return true;
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
index ba230bbcd2e..15f10a29d3d 100644
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -269,7 +269,8 @@ static Page _bt_blnewpage(uint32 level);
static BTPageState *_bt_pagestate(BTWriteState *wstate, uint32 level);
static void _bt_slideleft(Page page);
static void _bt_sortaddtup(Page page, Size itemsize,
- IndexTuple itup, OffsetNumber itup_off);
+ IndexTuple itup, OffsetNumber itup_off,
+ bool newfirstdataitem);
static void _bt_buildadd(BTWriteState *wstate, BTPageState *state,
IndexTuple itup, Size truncextra);
static void _bt_sort_dedup_finish_pending(BTWriteState *wstate,
@@ -750,26 +751,24 @@ _bt_slideleft(Page page)
/*
* Add an item to a page being built.
*
- * The main difference between this routine and a bare PageAddItem call
- * is that this code knows that the leftmost data item on a non-leaf btree
- * page has a key that must be treated as minus infinity. Therefore, it
- * truncates away all attributes.
+ * This is very similar to nbtinsert.c's _bt_pgaddtup(), but this variant
+ * raises an error directly.
*
- * This is almost like nbtinsert.c's _bt_pgaddtup(), but we can't use
- * that because it assumes that P_RIGHTMOST() will return the correct
- * answer for the page. Here, we don't know yet if the page will be
- * rightmost. Offset P_FIRSTKEY is always the first data key.
+ * Note that our nbtsort.c caller does not know yet if the page will be
+ * rightmost. Offset P_FIRSTKEY is always assumed to be the first data key by
+ * caller. Page that turns out to be the rightmost on its level is fixed by
+ * calling _bt_slideleft().
*/
static void
_bt_sortaddtup(Page page,
Size itemsize,
IndexTuple itup,
- OffsetNumber itup_off)
+ OffsetNumber itup_off,
+ bool newfirstdataitem)
{
- BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
IndexTupleData trunctuple;
- if (!P_ISLEAF(opaque) && itup_off == P_FIRSTKEY)
+ if (newfirstdataitem)
{
trunctuple = *itup;
trunctuple.t_info = sizeof(IndexTupleData);
@@ -867,12 +866,13 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup,
* Every newly built index will treat heap TID as part of the keyspace,
* which imposes the requirement that new high keys must occasionally have
* a heap TID appended within _bt_truncate(). That may leave a new pivot
- * tuple one or two MAXALIGN() quantums larger than the original first
- * right tuple it's derived from. v4 deals with the problem by decreasing
- * the limit on the size of tuples inserted on the leaf level by the same
- * small amount. Enforce the new v4+ limit on the leaf level, and the old
- * limit on internal levels, since pivot tuples may need to make use of
- * the reserved space. This should never fail on internal pages.
+ * tuple one or two MAXALIGN() quantums larger than the original
+ * firstright tuple it's derived from. v4 deals with the problem by
+ * decreasing the limit on the size of tuples inserted on the leaf level
+ * by the same small amount. Enforce the new v4+ limit on the leaf level,
+ * and the old limit on internal levels, since pivot tuples may need to
+ * make use of the reserved space. This should never fail on internal
+ * pages.
*/
if (unlikely(itupsz > BTMaxItemSize(npage)))
_bt_check_third_page(wstate->index, wstate->heap, isleaf, npage,
@@ -925,7 +925,8 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup,
Assert(last_off > P_FIRSTKEY);
ii = PageGetItemId(opage, last_off);
oitup = (IndexTuple) PageGetItem(opage, ii);
- _bt_sortaddtup(npage, ItemIdGetLength(ii), oitup, P_FIRSTKEY);
+ _bt_sortaddtup(npage, ItemIdGetLength(ii), oitup, P_FIRSTKEY,
+ !isleaf);
/*
* Move 'last' into the high key position on opage. _bt_blnewpage()
@@ -1054,7 +1055,8 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup,
* Add the new item into the current page.
*/
last_off = OffsetNumberNext(last_off);
- _bt_sortaddtup(npage, itupsz, itup, last_off);
+ _bt_sortaddtup(npage, itupsz, itup, last_off,
+ !isleaf && last_off == P_FIRSTKEY);
state->btps_page = npage;
state->btps_blkno = nblkno;
diff --git a/src/backend/access/nbtree/nbtsplitloc.c b/src/backend/access/nbtree/nbtsplitloc.c
index 8ba055be9e0..5f0d0be3c25 100644
--- a/src/backend/access/nbtree/nbtsplitloc.c
+++ b/src/backend/access/nbtree/nbtsplitloc.c
@@ -37,7 +37,7 @@ typedef struct
int16 rightfree; /* space left on right page post-split */
/* split point identifying fields (returned by _bt_findsplitloc) */
- OffsetNumber firstoldonright; /* first item on new right page */
+ OffsetNumber firstrightoff; /* first origpage item on rightpage */
bool newitemonleft; /* new item goes on left, or right? */
} SplitPoint;
@@ -46,7 +46,7 @@ typedef struct
{
/* context data for _bt_recsplitloc */
Relation rel; /* index relation */
- Page page; /* page undergoing split */
+ Page origpage; /* page undergoing split */
IndexTuple newitem; /* new item (cause of page split) */
Size newitemsz; /* size of newitem (includes line pointer) */
bool is_leaf; /* T if splitting a leaf page */
@@ -55,7 +55,7 @@ typedef struct
int leftspace; /* space available for items on left page */
int rightspace; /* space available for items on right page */
int olddataitemstotal; /* space taken by old items */
- Size minfirstrightsz; /* smallest firstoldonright tuple size */
+ Size minfirstrightsz; /* smallest firstright size */
/* candidate split point data */
int maxsplits; /* maximum number of splits */
@@ -65,8 +65,9 @@ typedef struct
} FindSplitData;
static void _bt_recsplitloc(FindSplitData *state,
- OffsetNumber firstoldonright, bool newitemonleft,
- int olddataitemstoleft, Size firstoldonrightsz);
+ OffsetNumber firstrightoff, bool newitemonleft,
+ int olddataitemstoleft,
+ Size firstrightofforigpagetuplesz);
static void _bt_deltasortsplits(FindSplitData *state, double fillfactormult,
bool usemult);
static int _bt_splitcmp(const void *arg1, const void *arg2);
@@ -119,13 +120,18 @@ static inline IndexTuple _bt_split_firstright(FindSplitData *state,
* suffix truncation.
*
* We return the index of the first existing tuple that should go on the
- * righthand page, plus a boolean indicating whether the new tuple goes on
- * the left or right page. The bool is necessary to disambiguate the case
- * where firstright == newitemoff.
+ * righthand page (which is called firstrightoff), plus a boolean
+ * indicating whether the new tuple goes on the left or right page. You
+ * can think of the returned state as a point _between_ two adjacent data
+ * items (laftleft and firstright data items) on an imaginary version of
+ * origpage that already includes newitem. The bool is necessary to
+ * disambiguate the case where firstrightoff == newitemoff (i.e. it is
+ * sometimes needed to determine if the firstright tuple for the split is
+ * newitem rather than the tuple from origpage at offset firstrightoff).
*/
OffsetNumber
_bt_findsplitloc(Relation rel,
- Page page,
+ Page origpage,
OffsetNumber newitemoff,
Size newitemsz,
IndexTuple newitem,
@@ -143,36 +149,36 @@ _bt_findsplitloc(Relation rel,
ItemId itemid;
OffsetNumber offnum,
maxoff,
- foundfirstright;
+ firstrightoff;
double fillfactormult;
bool usemult;
SplitPoint leftpage,
rightpage;
- opaque = (BTPageOpaque) PageGetSpecialPointer(page);
- maxoff = PageGetMaxOffsetNumber(page);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(origpage);
+ maxoff = PageGetMaxOffsetNumber(origpage);
/* Total free space available on a btree page, after fixed overhead */
leftspace = rightspace =
- PageGetPageSize(page) - SizeOfPageHeaderData -
+ PageGetPageSize(origpage) - SizeOfPageHeaderData -
MAXALIGN(sizeof(BTPageOpaqueData));
/* The right page will have the same high key as the old page */
if (!P_RIGHTMOST(opaque))
{
- itemid = PageGetItemId(page, P_HIKEY);
+ itemid = PageGetItemId(origpage, P_HIKEY);
rightspace -= (int) (MAXALIGN(ItemIdGetLength(itemid)) +
sizeof(ItemIdData));
}
/* Count up total space in data items before actually scanning 'em */
- olddataitemstotal = rightspace - (int) PageGetExactFreeSpace(page);
+ olddataitemstotal = rightspace - (int) PageGetExactFreeSpace(origpage);
leaffillfactor = BTGetFillFactor(rel);
/* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
newitemsz += sizeof(ItemIdData);
state.rel = rel;
- state.page = page;
+ state.origpage = origpage;
state.newitem = newitem;
state.newitemsz = newitemsz;
state.is_leaf = P_ISLEAF(opaque);
@@ -209,7 +215,7 @@ _bt_findsplitloc(Relation rel,
{
Size itemsz;
- itemid = PageGetItemId(page, offnum);
+ itemid = PageGetItemId(origpage, offnum);
itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);
/*
@@ -293,8 +299,7 @@ _bt_findsplitloc(Relation rel,
* New item inserted at rightmost point among a localized grouping on
* a leaf page -- apply "split after new item" optimization, either by
* applying leaf fillfactor multiplier, or by choosing the exact split
- * point that leaves the new item as last on the left. (usemult is set
- * for us.)
+ * point that leaves newitem as lastleft. (usemult is set for us.)
*/
if (usemult)
{
@@ -309,7 +314,7 @@ _bt_findsplitloc(Relation rel,
SplitPoint *split = state.splits + i;
if (split->newitemonleft &&
- newitemoff == split->firstoldonright)
+ newitemoff == split->firstrightoff)
{
pfree(state.splits);
*newitemonleft = true;
@@ -429,24 +434,26 @@ _bt_findsplitloc(Relation rel,
* the entry that has the lowest penalty, and is therefore expected to
* maximize fan-out. Sets *newitemonleft for us.
*/
- foundfirstright = _bt_bestsplitloc(&state, perfectpenalty, newitemonleft,
- strategy);
+ firstrightoff = _bt_bestsplitloc(&state, perfectpenalty, newitemonleft,
+ strategy);
pfree(state.splits);
- return foundfirstright;
+ return firstrightoff;
}
/*
* Subroutine to record a particular point between two tuples (possibly the
- * new item) on page (ie, combination of firstright and newitemonleft
- * settings) in *state for later analysis. This is also a convenient point
- * to check if the split is legal (if it isn't, it won't be recorded).
+ * new item) on page (ie, combination of firstrightoff and newitemonleft
+ * settings) in *state for later analysis. This is also a convenient point to
+ * check if the split is legal (if it isn't, it won't be recorded).
*
- * firstoldonright is the offset of the first item on the original page that
- * goes to the right page, and firstoldonrightsz is the size of that tuple.
- * firstoldonright can be > max offset, which means that all the old items go
- * to the left page and only the new item goes to the right page. In that
- * case, firstoldonrightsz is not used.
+ * firstrightoff is the offset of the first item on the original page that
+ * goes to the right page, and firstrightofforigpagetuplesz is the size of
+ * that tuple. firstrightoff can be > max offset, which means that all the
+ * old items go to the left page and only the new item goes to the right page.
+ * We don't actually use firstrightofforigpagetuplesz in that case (actually,
+ * we don't use it for _any_ split where the firstright tuple happens to be
+ * newitem).
*
* olddataitemstoleft is the total size of all old items to the left of the
* split point that is recorded here when legal. Should not include
@@ -454,41 +461,44 @@ _bt_findsplitloc(Relation rel,
*/
static void
_bt_recsplitloc(FindSplitData *state,
- OffsetNumber firstoldonright,
+ OffsetNumber firstrightoff,
bool newitemonleft,
int olddataitemstoleft,
- Size firstoldonrightsz)
+ Size firstrightofforigpagetuplesz)
{
int16 leftfree,
rightfree;
- Size firstrightitemsz;
+ Size firstrightsz;
Size postingsz = 0;
- bool newitemisfirstonright;
+ bool newitemisfirstright;
- /* Is the new item going to be the first item on the right page? */
- newitemisfirstonright = (firstoldonright == state->newitemoff
- && !newitemonleft);
+ /* Is the new item going to be split point's firstright tuple? */
+ newitemisfirstright = (firstrightoff == state->newitemoff &&
+ !newitemonleft);
- if (newitemisfirstonright)
- firstrightitemsz = state->newitemsz;
+ if (newitemisfirstright)
+ firstrightsz = state->newitemsz;
else
{
- firstrightitemsz = firstoldonrightsz;
+ firstrightsz = firstrightofforigpagetuplesz;
/*
- * Calculate suffix truncation space saving when firstright is a
- * posting list tuple, though only when the firstright is over 64
- * bytes including line pointer overhead (arbitrary). This avoids
- * accessing the tuple in cases where its posting list must be very
- * small (if firstright has one at all).
+ * Calculate suffix truncation space saving when firstright tuple is a
+ * posting list tuple, though only when the tuple is over 64 bytes
+ * including line pointer overhead (arbitrary). This avoids accessing
+ * the tuple in cases where its posting list must be very small (if
+ * tuple has one at all).
+ *
+ * Note: We don't do this in the case where firstright tuple is
+ * newitem, since newitem cannot have a posting list.
*/
- if (state->is_leaf && firstrightitemsz > 64)
+ if (state->is_leaf && firstrightsz > 64)
{
ItemId itemid;
IndexTuple newhighkey;
- itemid = PageGetItemId(state->page, firstoldonright);
- newhighkey = (IndexTuple) PageGetItem(state->page, itemid);
+ itemid = PageGetItemId(state->origpage, firstrightoff);
+ newhighkey = (IndexTuple) PageGetItem(state->origpage, itemid);
if (BTreeTupleIsPosting(newhighkey))
postingsz = IndexTupleSize(newhighkey) -
@@ -525,11 +535,11 @@ _bt_recsplitloc(FindSplitData *state,
* precise there noticeably improves the balance of free space.)
*/
if (state->is_leaf)
- leftfree -= (int16) (firstrightitemsz +
+ leftfree -= (int16) (firstrightsz +
MAXALIGN(sizeof(ItemPointerData)) -
postingsz);
else
- leftfree -= (int16) firstrightitemsz;
+ leftfree -= (int16) firstrightsz;
/* account for the new item */
if (newitemonleft)
@@ -542,7 +552,7 @@ _bt_recsplitloc(FindSplitData *state,
* data from the first item that winds up on the right page.
*/
if (!state->is_leaf)
- rightfree += (int16) firstrightitemsz -
+ rightfree += (int16) firstrightsz -
(int16) (MAXALIGN(sizeof(IndexTupleData)) + sizeof(ItemIdData));
/* Record split if legal */
@@ -550,13 +560,13 @@ _bt_recsplitloc(FindSplitData *state,
{
Assert(state->nsplits < state->maxsplits);
- /* Determine smallest firstright item size on page */
- state->minfirstrightsz = Min(state->minfirstrightsz, firstrightitemsz);
+ /* Determine smallest firstright tuple size among legal splits */
+ state->minfirstrightsz = Min(state->minfirstrightsz, firstrightsz);
state->splits[state->nsplits].curdelta = 0;
state->splits[state->nsplits].leftfree = leftfree;
state->splits[state->nsplits].rightfree = rightfree;
- state->splits[state->nsplits].firstoldonright = firstoldonright;
+ state->splits[state->nsplits].firstrightoff = firstrightoff;
state->splits[state->nsplits].newitemonleft = newitemonleft;
state->nsplits++;
}
@@ -632,8 +642,8 @@ _bt_splitcmp(const void *arg1, const void *arg2)
* taken by caller varies. Caller uses original leaf page fillfactor in
* standard way rather than using the new item offset directly when *usemult
* was also set to true here. Otherwise, caller applies optimization by
- * locating the legal split point that makes the new tuple the very last tuple
- * on the left side of the split.
+ * locating the legal split point that makes the new tuple the lastleft tuple
+ * for the split.
*/
static bool
_bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff,
@@ -694,8 +704,8 @@ _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff,
*/
if (state->newitemoff > maxoff)
{
- itemid = PageGetItemId(state->page, maxoff);
- tup = (IndexTuple) PageGetItem(state->page, itemid);
+ itemid = PageGetItemId(state->origpage, maxoff);
+ tup = (IndexTuple) PageGetItem(state->origpage, itemid);
keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem);
if (keepnatts > 1 && keepnatts <= nkeyatts)
@@ -720,8 +730,8 @@ _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff,
* optimization. Besides, all inappropriate cases triggered here will
* still split in the middle of the page on average.
*/
- itemid = PageGetItemId(state->page, OffsetNumberPrev(state->newitemoff));
- tup = (IndexTuple) PageGetItem(state->page, itemid);
+ itemid = PageGetItemId(state->origpage, OffsetNumberPrev(state->newitemoff));
+ tup = (IndexTuple) PageGetItem(state->origpage, itemid);
/* Do cheaper test first */
if (BTreeTupleIsPosting(tup) ||
!_bt_adjacenthtid(&tup->t_tid, &state->newitem->t_tid))
@@ -843,8 +853,8 @@ _bt_bestsplitloc(FindSplitData *state, int perfectpenalty,
* leftmost page.
*/
if (strategy == SPLIT_MANY_DUPLICATES && !state->is_rightmost &&
- !final->newitemonleft && final->firstoldonright >= state->newitemoff &&
- final->firstoldonright < state->newitemoff + MAX_LEAF_INTERVAL)
+ !final->newitemonleft && final->firstrightoff >= state->newitemoff &&
+ final->firstrightoff < state->newitemoff + MAX_LEAF_INTERVAL)
{
/*
* Avoid the problem by performing a 50:50 split when the new item is
@@ -854,7 +864,7 @@ _bt_bestsplitloc(FindSplitData *state, int perfectpenalty,
}
*newitemonleft = final->newitemonleft;
- return final->firstoldonright;
+ return final->firstrightoff;
}
/*
@@ -883,10 +893,11 @@ _bt_strategy(FindSplitData *state, SplitPoint *leftpage,
*strategy = SPLIT_DEFAULT;
/*
- * Use smallest observed first right item size for entire page as perfect
+ * Use smallest observed firstright item size for entire page (actually,
+ * entire imaginary version of page that includes newitem) as perfect
* penalty on internal pages. This can save cycles in the common case
- * where most or all splits (not just splits within interval) have first
- * right tuples that are the same size.
+ * where most or all splits (not just splits within interval) have
+ * firstright tuples that are the same size.
*/
if (!state->is_leaf)
return state->minfirstrightsz;
@@ -961,8 +972,8 @@ _bt_strategy(FindSplitData *state, SplitPoint *leftpage,
ItemId itemid;
IndexTuple hikey;
- itemid = PageGetItemId(state->page, P_HIKEY);
- hikey = (IndexTuple) PageGetItem(state->page, itemid);
+ itemid = PageGetItemId(state->origpage, P_HIKEY);
+ hikey = (IndexTuple) PageGetItem(state->origpage, itemid);
perfectpenalty = _bt_keep_natts_fast(state->rel, hikey,
state->newitem);
if (perfectpenalty <= indnkeyatts)
@@ -1005,12 +1016,12 @@ _bt_interval_edges(FindSplitData *state, SplitPoint **leftinterval,
{
SplitPoint *distant = state->splits + i;
- if (distant->firstoldonright < deltaoptimal->firstoldonright)
+ if (distant->firstrightoff < deltaoptimal->firstrightoff)
{
if (*leftinterval == NULL)
*leftinterval = distant;
}
- else if (distant->firstoldonright > deltaoptimal->firstoldonright)
+ else if (distant->firstrightoff > deltaoptimal->firstrightoff)
{
if (*rightinterval == NULL)
*rightinterval = distant;
@@ -1018,22 +1029,20 @@ _bt_interval_edges(FindSplitData *state, SplitPoint **leftinterval,
else if (!distant->newitemonleft && deltaoptimal->newitemonleft)
{
/*
- * "incoming tuple will become first on right page" (distant) is
- * to the left of "incoming tuple will become last on left page"
- * (delta-optimal)
+ * "incoming tuple will become firstright" (distant) is to the
+ * left of "incoming tuple will become lastleft" (delta-optimal)
*/
- Assert(distant->firstoldonright == state->newitemoff);
+ Assert(distant->firstrightoff == state->newitemoff);
if (*leftinterval == NULL)
*leftinterval = distant;
}
else if (distant->newitemonleft && !deltaoptimal->newitemonleft)
{
/*
- * "incoming tuple will become last on left page" (distant) is to
- * the right of "incoming tuple will become first on right page"
- * (delta-optimal)
+ * "incoming tuple will become lastleft" (distant) is to the right
+ * of "incoming tuple will become firstright" (delta-optimal)
*/
- Assert(distant->firstoldonright == state->newitemoff);
+ Assert(distant->firstrightoff == state->newitemoff);
if (*rightinterval == NULL)
*rightinterval = distant;
}
@@ -1062,34 +1071,33 @@ _bt_interval_edges(FindSplitData *state, SplitPoint **leftinterval,
* key for left page. It can be greater than the number of key attributes in
* cases where a heap TID will need to be appended during truncation.
*
- * On internal pages, penalty is simply the size of the first item on the
- * right half of the split (including line pointer overhead). This tuple will
- * become the new high key for the left page.
+ * On internal pages, penalty is simply the size of the firstright tuple for
+ * the split (including line pointer overhead). This tuple will become the
+ * new high key for the left page.
*/
static inline int
_bt_split_penalty(FindSplitData *state, SplitPoint *split)
{
- IndexTuple lastleftuple;
- IndexTuple firstrighttuple;
+ IndexTuple lastleft;
+ IndexTuple firstright;
if (!state->is_leaf)
{
ItemId itemid;
if (!split->newitemonleft &&
- split->firstoldonright == state->newitemoff)
+ split->firstrightoff == state->newitemoff)
return state->newitemsz;
- itemid = PageGetItemId(state->page, split->firstoldonright);
+ itemid = PageGetItemId(state->origpage, split->firstrightoff);
return MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);
}
- lastleftuple = _bt_split_lastleft(state, split);
- firstrighttuple = _bt_split_firstright(state, split);
+ lastleft = _bt_split_lastleft(state, split);
+ firstright = _bt_split_firstright(state, split);
- Assert(lastleftuple != firstrighttuple);
- return _bt_keep_natts_fast(state->rel, lastleftuple, firstrighttuple);
+ return _bt_keep_natts_fast(state->rel, lastleft, firstright);
}
/*
@@ -1100,12 +1108,12 @@ _bt_split_lastleft(FindSplitData *state, SplitPoint *split)
{
ItemId itemid;
- if (split->newitemonleft && split->firstoldonright == state->newitemoff)
+ if (split->newitemonleft && split->firstrightoff == state->newitemoff)
return state->newitem;
- itemid = PageGetItemId(state->page,
- OffsetNumberPrev(split->firstoldonright));
- return (IndexTuple) PageGetItem(state->page, itemid);
+ itemid = PageGetItemId(state->origpage,
+ OffsetNumberPrev(split->firstrightoff));
+ return (IndexTuple) PageGetItem(state->origpage, itemid);
}
/*
@@ -1116,9 +1124,9 @@ _bt_split_firstright(FindSplitData *state, SplitPoint *split)
{
ItemId itemid;
- if (!split->newitemonleft && split->firstoldonright == state->newitemoff)
+ if (!split->newitemonleft && split->firstrightoff == state->newitemoff)
return state->newitem;
- itemid = PageGetItemId(state->page, split->firstoldonright);
- return (IndexTuple) PageGetItem(state->page, itemid);
+ itemid = PageGetItemId(state->origpage, split->firstrightoff);
+ return (IndexTuple) PageGetItem(state->origpage, itemid);
}
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index 08d7201f196..ce48a51640a 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -2346,17 +2346,12 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright,
ScanKey scankey;
/*
- * Be consistent about the representation of BTREE_VERSION 2/3 tuples
- * across Postgres versions; don't allow new pivot tuples to have
- * truncated key attributes there. _bt_compare() treats truncated key
- * attributes as having the value minus infinity, which would break
- * searches within !heapkeyspace indexes.
+ * _bt_compare() treats truncated key attributes as having the value minus
+ * infinity, which would break searches within !heapkeyspace indexes. We
+ * must still truncate away non-key attribute values, though.
*/
if (!itup_key->heapkeyspace)
- {
- Assert(nkeyatts != IndexRelationGetNumberOfAttributes(rel));
return nkeyatts;
- }
scankey = itup_key->scankeys;
keepnatts = 1;
diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c
index 99d0914e724..493931e1b83 100644
--- a/src/backend/access/nbtree/nbtxlog.c
+++ b/src/backend/access/nbtree/nbtxlog.c
@@ -251,7 +251,7 @@ btree_xlog_insert(bool isleaf, bool ismeta, bool posting,
}
static void
-btree_xlog_split(bool onleft, XLogReaderState *record)
+btree_xlog_split(bool newitemonleft, XLogReaderState *record)
{
XLogRecPtr lsn = record->EndRecPtr;
xl_btree_split *xlrec = (xl_btree_split *) XLogRecGetData(record);
@@ -323,7 +323,7 @@ btree_xlog_split(bool onleft, XLogReaderState *record)
datapos = XLogRecGetBlockData(record, 0, &datalen);
- if (onleft || xlrec->postingoff != 0)
+ if (newitemonleft || xlrec->postingoff != 0)
{
newitem = (IndexTuple) datapos;
newitemsz = MAXALIGN(IndexTupleSize(newitem));
@@ -368,7 +368,7 @@ btree_xlog_split(bool onleft, XLogReaderState *record)
elog(PANIC, "failed to add high key to left page after split");
leftoff = OffsetNumberNext(leftoff);
- for (off = P_FIRSTDATAKEY(lopaque); off < xlrec->firstright; off++)
+ for (off = P_FIRSTDATAKEY(lopaque); off < xlrec->firstrightoff; off++)
{
ItemId itemid;
Size itemsz;
@@ -377,7 +377,8 @@ btree_xlog_split(bool onleft, XLogReaderState *record)
/* Add replacement posting list when required */
if (off == replacepostingoff)
{
- Assert(onleft || xlrec->firstright == xlrec->newitemoff);
+ Assert(newitemonleft ||
+ xlrec->firstrightoff == xlrec->newitemoff);
if (PageAddItem(newlpage, (Item) nposting,
MAXALIGN(IndexTupleSize(nposting)), leftoff,
false, false) == InvalidOffsetNumber)
@@ -387,7 +388,7 @@ btree_xlog_split(bool onleft, XLogReaderState *record)
}
/* add the new item if it was inserted on left page */
- else if (onleft && off == xlrec->newitemoff)
+ else if (newitemonleft && off == xlrec->newitemoff)
{
if (PageAddItem(newlpage, (Item) newitem, newitemsz, leftoff,
false, false) == InvalidOffsetNumber)
@@ -405,7 +406,7 @@ btree_xlog_split(bool onleft, XLogReaderState *record)
}
/* cope with possibility that newitem goes at the end */
- if (onleft && off == xlrec->newitemoff)
+ if (newitemonleft && off == xlrec->newitemoff)
{
if (PageAddItem(newlpage, (Item) newitem, newitemsz, leftoff,
false, false) == InvalidOffsetNumber)
diff --git a/src/backend/access/rmgrdesc/nbtdesc.c b/src/backend/access/rmgrdesc/nbtdesc.c
index 7a1616f371c..e099107f91e 100644
--- a/src/backend/access/rmgrdesc/nbtdesc.c
+++ b/src/backend/access/rmgrdesc/nbtdesc.c
@@ -39,8 +39,8 @@ btree_desc(StringInfo buf, XLogReaderState *record)
{
xl_btree_split *xlrec = (xl_btree_split *) rec;
- appendStringInfo(buf, "level %u, firstright %d, newitemoff %d, postingoff %d",
- xlrec->level, xlrec->firstright,
+ appendStringInfo(buf, "level %u, firstrightoff %d, newitemoff %d, postingoff %d",
+ xlrec->level, xlrec->firstrightoff,
xlrec->newitemoff, xlrec->postingoff);
break;
}
diff --git a/src/include/access/nbtxlog.h b/src/include/access/nbtxlog.h
index 347976c532c..ec7b2a6c33e 100644
--- a/src/include/access/nbtxlog.h
+++ b/src/include/access/nbtxlog.h
@@ -99,9 +99,9 @@ typedef struct xl_btree_insert
* left or right split page (and thus, whether the new item is stored or not).
* We always log the left page high key because suffix truncation can generate
* a new leaf high key using user-defined code. This is also necessary on
- * internal pages, since the first right item that the left page's high key
- * was based on will have been truncated to zero attributes in the right page
- * (the original is unavailable from the right page).
+ * internal pages, since the firstright item that the left page's high key was
+ * based on will have been truncated to zero attributes in the right page (the
+ * separator key is unavailable from the right page).
*
* Backup Blk 0: original page / new left page
*
@@ -153,7 +153,7 @@ typedef struct xl_btree_insert
typedef struct xl_btree_split
{
uint32 level; /* tree level of page being split */
- OffsetNumber firstright; /* first item moved to right page */
+ OffsetNumber firstrightoff; /* first origpage item on rightpage */
OffsetNumber newitemoff; /* new item's offset */
uint16 postingoff; /* offset inside orig posting tuple */
} xl_btree_split;