aboutsummaryrefslogtreecommitdiff
path: root/src/backend/access/nbtree/nbtsort.c
diff options
context:
space:
mode:
authorPeter Geoghegan <pg@bowt.ie>2020-04-13 16:39:55 -0700
committerPeter Geoghegan <pg@bowt.ie>2020-04-13 16:39:55 -0700
commitbc3087b626d1073c9b7c9687b334785909ca2237 (patch)
tree88911db9b3741a33774d62faad6390ed92779095 /src/backend/access/nbtree/nbtsort.c
parent8f00d84afc0dad577b65df5a313e5306cee3d11f (diff)
downloadpostgresql-bc3087b626d1073c9b7c9687b334785909ca2237.tar.gz
postgresql-bc3087b626d1073c9b7c9687b334785909ca2237.zip
Harmonize nbtree page split point code.
An nbtree split point can be thought of as a point between two adjoining tuples from an imaginary version of the page being split that includes the incoming/new item (in addition to the items that really are on the page). These adjoining tuples are called the lastleft and firstright tuples. The variables that represent split points contained a field called firstright, which is an offset number of the first data item from the original page that goes on the new right page. The corresponding tuple from origpage was usually the same thing as the actual firstright tuple, but not always: the firstright tuple is sometimes the new/incoming item instead. This situation seems unnecessarily confusing. Make things clearer by renaming the origpage offset returned by _bt_findsplitloc() to "firstrightoff". We now have a firstright tuple and a firstrightoff offset number which are comparable to the newitem/lastleft tuples and the newitemoff/lastleftoff offset numbers respectively. Also make sure that we are consistent about how we describe nbtree page split point state. Push the responsibility for dealing with pg_upgrade'd !heapkeyspace indexes down to lower level code, relieving _bt_split() from dealing with it directly. This means that we always have a palloc'd left page high key on the leaf level, no matter what. This enables simplifying some of the code (and code comments) within _bt_split(). Finally, restructure the page split code to make it clearer why suffix truncation (which only takes place during leaf page splits) is completely different to the first data item truncation that takes place during internal page splits. Tuples are marked as having fewer attributes stored in both cases, and the firstright tuple is truncated in both cases, so it's easy to imagine somebody missing the distinction.
Diffstat (limited to 'src/backend/access/nbtree/nbtsort.c')
-rw-r--r--src/backend/access/nbtree/nbtsort.c42
1 files changed, 22 insertions, 20 deletions
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
index ba230bbcd2e..15f10a29d3d 100644
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -269,7 +269,8 @@ static Page _bt_blnewpage(uint32 level);
static BTPageState *_bt_pagestate(BTWriteState *wstate, uint32 level);
static void _bt_slideleft(Page page);
static void _bt_sortaddtup(Page page, Size itemsize,
- IndexTuple itup, OffsetNumber itup_off);
+ IndexTuple itup, OffsetNumber itup_off,
+ bool newfirstdataitem);
static void _bt_buildadd(BTWriteState *wstate, BTPageState *state,
IndexTuple itup, Size truncextra);
static void _bt_sort_dedup_finish_pending(BTWriteState *wstate,
@@ -750,26 +751,24 @@ _bt_slideleft(Page page)
/*
* Add an item to a page being built.
*
- * The main difference between this routine and a bare PageAddItem call
- * is that this code knows that the leftmost data item on a non-leaf btree
- * page has a key that must be treated as minus infinity. Therefore, it
- * truncates away all attributes.
+ * This is very similar to nbtinsert.c's _bt_pgaddtup(), but this variant
+ * raises an error directly.
*
- * This is almost like nbtinsert.c's _bt_pgaddtup(), but we can't use
- * that because it assumes that P_RIGHTMOST() will return the correct
- * answer for the page. Here, we don't know yet if the page will be
- * rightmost. Offset P_FIRSTKEY is always the first data key.
+ * Note that our nbtsort.c caller does not know yet if the page will be
+ * rightmost. Offset P_FIRSTKEY is always assumed to be the first data key by
+ * caller. Page that turns out to be the rightmost on its level is fixed by
+ * calling _bt_slideleft().
*/
static void
_bt_sortaddtup(Page page,
Size itemsize,
IndexTuple itup,
- OffsetNumber itup_off)
+ OffsetNumber itup_off,
+ bool newfirstdataitem)
{
- BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
IndexTupleData trunctuple;
- if (!P_ISLEAF(opaque) && itup_off == P_FIRSTKEY)
+ if (newfirstdataitem)
{
trunctuple = *itup;
trunctuple.t_info = sizeof(IndexTupleData);
@@ -867,12 +866,13 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup,
* Every newly built index will treat heap TID as part of the keyspace,
* which imposes the requirement that new high keys must occasionally have
* a heap TID appended within _bt_truncate(). That may leave a new pivot
- * tuple one or two MAXALIGN() quantums larger than the original first
- * right tuple it's derived from. v4 deals with the problem by decreasing
- * the limit on the size of tuples inserted on the leaf level by the same
- * small amount. Enforce the new v4+ limit on the leaf level, and the old
- * limit on internal levels, since pivot tuples may need to make use of
- * the reserved space. This should never fail on internal pages.
+ * tuple one or two MAXALIGN() quantums larger than the original
+ * firstright tuple it's derived from. v4 deals with the problem by
+ * decreasing the limit on the size of tuples inserted on the leaf level
+ * by the same small amount. Enforce the new v4+ limit on the leaf level,
+ * and the old limit on internal levels, since pivot tuples may need to
+ * make use of the reserved space. This should never fail on internal
+ * pages.
*/
if (unlikely(itupsz > BTMaxItemSize(npage)))
_bt_check_third_page(wstate->index, wstate->heap, isleaf, npage,
@@ -925,7 +925,8 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup,
Assert(last_off > P_FIRSTKEY);
ii = PageGetItemId(opage, last_off);
oitup = (IndexTuple) PageGetItem(opage, ii);
- _bt_sortaddtup(npage, ItemIdGetLength(ii), oitup, P_FIRSTKEY);
+ _bt_sortaddtup(npage, ItemIdGetLength(ii), oitup, P_FIRSTKEY,
+ !isleaf);
/*
* Move 'last' into the high key position on opage. _bt_blnewpage()
@@ -1054,7 +1055,8 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup,
* Add the new item into the current page.
*/
last_off = OffsetNumberNext(last_off);
- _bt_sortaddtup(npage, itupsz, itup, last_off);
+ _bt_sortaddtup(npage, itupsz, itup, last_off,
+ !isleaf && last_off == P_FIRSTKEY);
state->btps_page = npage;
state->btps_blkno = nblkno;