aboutsummaryrefslogtreecommitdiff
path: root/src/backend/access/nbtree/nbtsort.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/access/nbtree/nbtsort.c')
-rw-r--r--src/backend/access/nbtree/nbtsort.c272
1 files changed, 116 insertions, 156 deletions
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
index 458abe7754c..1981f554693 100644
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -6,8 +6,12 @@
*
* We use tuplesort.c to sort the given index tuples into order.
* Then we scan the index tuples in order and build the btree pages
- * for each level. When we have only one page on a level, it must be the
- * root -- it can be attached to the btree metapage and we are done.
+ * for each level. We load source tuples into leaf-level pages.
+ * Whenever we fill a page at one level, we add a link to it to its
+ * parent level (starting a new parent level if necessary). When
+ * done, we write out each final page on each level, adding it to
+ * its parent level. When we have only one page on a level, it must be
+ * the root -- it can be attached to the btree metapage and we are done.
*
* this code is moderately slow (~10% slower) compared to the regular
* btree (insertion) build code on sorted or well-clustered data. on
@@ -23,12 +27,20 @@
* something like the standard 70% steady-state load factor for btrees
* would probably be better.
*
+ * Another limitation is that we currently load full copies of all keys
+ * into upper tree levels. The leftmost data key in each non-leaf node
+ * could be omitted as far as normal btree operations are concerned
+ * (see README for more info). However, because we build the tree from
+ * the bottom up, we need that data key to insert into the node's parent.
+ * This could be fixed by keeping a spare copy of the minimum key in the
+ * state stack, but I haven't time for that right now.
+ *
*
* Portions Copyright (c) 1996-2000, PostgreSQL, Inc
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsort.c,v 1.54 2000/06/15 04:09:36 momjian Exp $
+ * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsort.c,v 1.55 2000/07/21 06:42:33 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -57,6 +69,20 @@ struct BTSpool
bool isunique;
};
+/*
+ * Status record for a btree page being built. We have one of these
+ * for each active tree level.
+ */
+typedef struct BTPageState
+{
+ Buffer btps_buf; /* current buffer & page */
+ Page btps_page;
+ OffsetNumber btps_lastoff; /* last item offset loaded */
+ int btps_level;
+ struct BTPageState *btps_next; /* link to parent level, if any */
+} BTPageState;
+
+
#define BTITEMSZ(btitem) \
((btitem) ? \
(IndexTupleDSize((btitem)->bti_itup) + \
@@ -65,13 +91,11 @@ struct BTSpool
static void _bt_load(Relation index, BTSpool *btspool);
-static BTItem _bt_buildadd(Relation index, Size keysz, ScanKey scankey,
- BTPageState *state, BTItem bti, int flags);
+static void _bt_buildadd(Relation index, BTPageState *state,
+ BTItem bti, int flags);
static BTItem _bt_minitem(Page opage, BlockNumber oblkno, int atend);
-static BTPageState *_bt_pagestate(Relation index, int flags,
- int level, bool doupper);
-static void _bt_uppershutdown(Relation index, Size keysz, ScanKey scankey,
- BTPageState *state);
+static BTPageState *_bt_pagestate(Relation index, int flags, int level);
+static void _bt_uppershutdown(Relation index, BTPageState *state);
/*
@@ -159,9 +183,6 @@ _bt_blnewpage(Relation index, Buffer *buf, Page *page, int flags)
BTPageOpaque opaque;
*buf = _bt_getbuf(index, P_NEW, BT_WRITE);
-#ifdef NOT_USED
- printf("\tblk=%d\n", BufferGetBlockNumber(*buf));
-#endif
*page = BufferGetPage(*buf);
_bt_pageinit(*page, BufferGetPageSize(*buf));
opaque = (BTPageOpaque) PageGetSpecialPointer(*page);
@@ -202,18 +223,15 @@ _bt_slideleft(Relation index, Buffer buf, Page page)
* is suitable for immediate use by _bt_buildadd.
*/
static BTPageState *
-_bt_pagestate(Relation index, int flags, int level, bool doupper)
+_bt_pagestate(Relation index, int flags, int level)
{
BTPageState *state = (BTPageState *) palloc(sizeof(BTPageState));
MemSet((char *) state, 0, sizeof(BTPageState));
_bt_blnewpage(index, &(state->btps_buf), &(state->btps_page), flags);
- state->btps_firstoff = InvalidOffsetNumber;
state->btps_lastoff = P_HIKEY;
- state->btps_lastbti = (BTItem) NULL;
state->btps_next = (BTPageState *) NULL;
state->btps_level = level;
- state->btps_doupper = doupper;
return state;
}
@@ -240,31 +258,27 @@ _bt_minitem(Page opage, BlockNumber oblkno, int atend)
}
/*
- * add an item to a disk page from a merge tape block.
+ * add an item to a disk page from the sort output.
*
* we must be careful to observe the following restrictions, placed
* upon us by the conventions in nbtsearch.c:
* - rightmost pages start data items at P_HIKEY instead of at
* P_FIRSTKEY.
- * - duplicates cannot be split among pages unless the chain of
- * duplicates starts at the first data item.
*
* a leaf page being built looks like:
*
* +----------------+---------------------------------+
* | PageHeaderData | linp0 linp1 linp2 ... |
* +-----------+----+---------------------------------+
- * | ... linpN | ^ first |
+ * | ... linpN | |
* +-----------+--------------------------------------+
* | ^ last |
* | |
- * | v last |
* +-------------+------------------------------------+
* | | itemN ... |
* +-------------+------------------+-----------------+
* | ... item3 item2 item1 | "special space" |
* +--------------------------------+-----------------+
- * ^ first
*
* contrast this with the diagram in bufpage.h; note the mismatch
* between linps and items. this is because we reserve linp0 as a
@@ -272,30 +286,20 @@ _bt_minitem(Page opage, BlockNumber oblkno, int atend)
* filled up the page, we will set linp0 to point to itemN and clear
* linpN.
*
- * 'last' pointers indicate the last offset/item added to the page.
- * 'first' pointers indicate the first offset/item that is part of a
- * chain of duplicates extending from 'first' to 'last'.
- *
- * if all keys are unique, 'first' will always be the same as 'last'.
+ * 'last' pointer indicates the last offset added to the page.
*/
-static BTItem
-_bt_buildadd(Relation index, Size keysz, ScanKey scankey,
- BTPageState *state, BTItem bti, int flags)
+static void
+_bt_buildadd(Relation index, BTPageState *state, BTItem bti, int flags)
{
Buffer nbuf;
Page npage;
- BTItem last_bti;
- OffsetNumber first_off;
OffsetNumber last_off;
- OffsetNumber off;
Size pgspc;
Size btisz;
nbuf = state->btps_buf;
npage = state->btps_page;
- first_off = state->btps_firstoff;
last_off = state->btps_lastoff;
- last_bti = state->btps_lastbti;
pgspc = PageGetFreeSpace(npage);
btisz = BTITEMSZ(bti);
@@ -319,75 +323,55 @@ _bt_buildadd(Relation index, Size keysz, ScanKey scankey,
if (pgspc < btisz)
{
+ /*
+ * Item won't fit on this page, so finish off the page and
+ * write it out.
+ */
Buffer obuf = nbuf;
Page opage = npage;
- OffsetNumber o,
- n;
ItemId ii;
ItemId hii;
+ BTItem nbti;
_bt_blnewpage(index, &nbuf, &npage, flags);
/*
- * if 'last' is part of a chain of duplicates that does not start
- * at the beginning of the old page, the entire chain is copied to
- * the new page; we delete all of the duplicates from the old page
- * except the first, which becomes the high key item of the old
- * page.
+ * We copy the last item on the page into the new page, and then
+ * rearrange the old page so that the 'last item' becomes its high
+ * key rather than a true data item.
*
- * if the chain starts at the beginning of the page or there is no
- * chain ('first' == 'last'), we need only copy 'last' to the new
- * page. again, 'first' (== 'last') becomes the high key of the
- * old page.
- *
- * note that in either case, we copy at least one item to the new
- * page, so 'last_bti' will always be valid. 'bti' will never be
- * the first data item on the new page.
+ * note that since we always copy an item to the new page,
+ * 'bti' will never be the first data item on the new page.
*/
- if (first_off == P_FIRSTKEY)
+ ii = PageGetItemId(opage, last_off);
+ if (PageAddItem(npage, PageGetItem(opage, ii), ii->lp_len,
+ P_FIRSTKEY, LP_USED) == InvalidOffsetNumber)
+ elog(FATAL, "btree: failed to add item to the page in _bt_sort (1)");
+#ifdef FASTBUILD_DEBUG
{
- Assert(last_off != P_FIRSTKEY);
- first_off = last_off;
+ bool isnull;
+ BTItem tmpbti =
+ (BTItem) PageGetItem(npage, PageGetItemId(npage, P_FIRSTKEY));
+ Datum d = index_getattr(&(tmpbti->bti_itup), 1,
+ index->rd_att, &isnull);
+
+ printf("_bt_buildadd: moved <%x> to offset %d at level %d\n",
+ d, P_FIRSTKEY, state->btps_level);
}
- for (o = first_off, n = P_FIRSTKEY;
- o <= last_off;
- o = OffsetNumberNext(o), n = OffsetNumberNext(n))
- {
- ii = PageGetItemId(opage, o);
- if (PageAddItem(npage, PageGetItem(opage, ii),
- ii->lp_len, n, LP_USED) == InvalidOffsetNumber)
- elog(FATAL, "btree: failed to add item to the page in _bt_sort (1)");
-#ifdef FASTBUILD_DEBUG
- {
- bool isnull;
- BTItem tmpbti =
- (BTItem) PageGetItem(npage, PageGetItemId(npage, n));
- Datum d = index_getattr(&(tmpbti->bti_itup), 1,
- index->rd_att, &isnull);
-
- printf("_bt_buildadd: moved <%x> to offset %d at level %d\n",
- d, n, state->btps_level);
- }
#endif
- }
/*
- * this loop is backward because PageIndexTupleDelete shuffles the
- * tuples to fill holes in the page -- by starting at the end and
- * working back, we won't create holes (and thereby avoid
- * shuffling).
+ * Move 'last' into the high key position on opage
*/
- for (o = last_off; o > first_off; o = OffsetNumberPrev(o))
- PageIndexTupleDelete(opage, o);
hii = PageGetItemId(opage, P_HIKEY);
- ii = PageGetItemId(opage, first_off);
*hii = *ii;
ii->lp_flags &= ~LP_USED;
((PageHeader) opage)->pd_lower -= sizeof(ItemIdData);
- first_off = P_FIRSTKEY;
+ /*
+ * Reset last_off to point to new page
+ */
last_off = PageGetMaxOffsetNumber(npage);
- last_bti = (BTItem) PageGetItem(npage, PageGetItemId(npage, last_off));
/*
* set the page (side link) pointers.
@@ -399,32 +383,21 @@ _bt_buildadd(Relation index, Size keysz, ScanKey scankey,
oopaque->btpo_next = BufferGetBlockNumber(nbuf);
nopaque->btpo_prev = BufferGetBlockNumber(obuf);
nopaque->btpo_next = P_NONE;
-
- if (_bt_itemcmp(index, keysz, scankey,
- (BTItem) PageGetItem(opage, PageGetItemId(opage, P_HIKEY)),
- (BTItem) PageGetItem(opage, PageGetItemId(opage, P_FIRSTKEY)),
- BTEqualStrategyNumber))
- oopaque->btpo_flags |= BTP_CHAIN;
}
/*
- * copy the old buffer's minimum key to its parent. if we don't
- * have a parent, we have to create one; this adds a new btree
- * level.
+ * Link the old buffer into its parent, using its minimum key.
+ * If we don't have a parent, we have to create one;
+ * this adds a new btree level.
*/
- if (state->btps_doupper)
+ if (state->btps_next == (BTPageState *) NULL)
{
- BTItem nbti;
-
- if (state->btps_next == (BTPageState *) NULL)
- {
- state->btps_next =
- _bt_pagestate(index, 0, state->btps_level + 1, true);
- }
- nbti = _bt_minitem(opage, BufferGetBlockNumber(obuf), 0);
- _bt_buildadd(index, keysz, scankey, state->btps_next, nbti, 0);
- pfree((void *) nbti);
+ state->btps_next =
+ _bt_pagestate(index, 0, state->btps_level + 1);
}
+ nbti = _bt_minitem(opage, BufferGetBlockNumber(obuf), 0);
+ _bt_buildadd(index, state->btps_next, nbti, 0);
+ pfree((void *) nbti);
/*
* write out the old stuff. we never want to see it again, so we
@@ -435,11 +408,11 @@ _bt_buildadd(Relation index, Size keysz, ScanKey scankey,
}
/*
- * if this item is different from the last item added, we start a new
- * chain of duplicates.
+ * Add the new item into the current page.
*/
- off = OffsetNumberNext(last_off);
- if (PageAddItem(npage, (Item) bti, btisz, off, LP_USED) == InvalidOffsetNumber)
+ last_off = OffsetNumberNext(last_off);
+ if (PageAddItem(npage, (Item) bti, btisz,
+ last_off, LP_USED) == InvalidOffsetNumber)
elog(FATAL, "btree: failed to add item to the page in _bt_sort (2)");
#ifdef FASTBUILD_DEBUG
{
@@ -447,65 +420,57 @@ _bt_buildadd(Relation index, Size keysz, ScanKey scankey,
Datum d = index_getattr(&(bti->bti_itup), 1, index->rd_att, &isnull);
printf("_bt_buildadd: inserted <%x> at offset %d at level %d\n",
- d, off, state->btps_level);
+ d, last_off, state->btps_level);
}
#endif
- if (last_bti == (BTItem) NULL)
- first_off = P_FIRSTKEY;
- else if (!_bt_itemcmp(index, keysz, scankey,
- bti, last_bti, BTEqualStrategyNumber))
- first_off = off;
- last_off = off;
- last_bti = (BTItem) PageGetItem(npage, PageGetItemId(npage, off));
state->btps_buf = nbuf;
state->btps_page = npage;
- state->btps_lastbti = last_bti;
state->btps_lastoff = last_off;
- state->btps_firstoff = first_off;
-
- return last_bti;
}
+/*
+ * Finish writing out the completed btree.
+ */
static void
-_bt_uppershutdown(Relation index, Size keysz, ScanKey scankey,
- BTPageState *state)
+_bt_uppershutdown(Relation index, BTPageState *state)
{
BTPageState *s;
BlockNumber blkno;
BTPageOpaque opaque;
BTItem bti;
+ /*
+ * Each iteration of this loop completes one more level of the tree.
+ */
for (s = state; s != (BTPageState *) NULL; s = s->btps_next)
{
blkno = BufferGetBlockNumber(s->btps_buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(s->btps_page);
/*
- * if this is the root, attach it to the metapage. otherwise,
- * stick the minimum key of the last page on this level (which has
- * not been split, or else it wouldn't be the last page) into its
- * parent. this may cause the last page of upper levels to split,
- * but that's not a problem -- we haven't gotten to them yet.
+ * We have to link the last page on this level to somewhere.
+ *
+ * If we're at the top, it's the root, so attach it to the metapage.
+ * Otherwise, add an entry for it to its parent using its minimum
+ * key. This may cause the last page of the parent level to split,
+ * but that's not a problem -- we haven't gotten to it yet.
*/
- if (s->btps_doupper)
+ if (s->btps_next == (BTPageState *) NULL)
{
- if (s->btps_next == (BTPageState *) NULL)
- {
- opaque->btpo_flags |= BTP_ROOT;
- _bt_metaproot(index, blkno, s->btps_level + 1);
- }
- else
- {
- bti = _bt_minitem(s->btps_page, blkno, 0);
- _bt_buildadd(index, keysz, scankey, s->btps_next, bti, 0);
- pfree((void *) bti);
- }
+ opaque->btpo_flags |= BTP_ROOT;
+ _bt_metaproot(index, blkno, s->btps_level + 1);
+ }
+ else
+ {
+ bti = _bt_minitem(s->btps_page, blkno, 0);
+ _bt_buildadd(index, s->btps_next, bti, 0);
+ pfree((void *) bti);
}
/*
- * this is the rightmost page, so the ItemId array needs to be
- * slid back one slot.
+ * This is the rightmost page, so the ItemId array needs to be
+ * slid back one slot. Then we can dump out the page.
*/
_bt_slideleft(index, s->btps_buf, s->btps_page);
_bt_wrtbuf(index, s->btps_buf);
@@ -519,32 +484,27 @@ _bt_uppershutdown(Relation index, Size keysz, ScanKey scankey,
static void
_bt_load(Relation index, BTSpool *btspool)
{
- BTPageState *state;
- ScanKey skey;
- int natts;
- BTItem bti;
- bool should_free;
-
- /*
- * initialize state needed for the merge into the btree leaf pages.
- */
- state = _bt_pagestate(index, BTP_LEAF, 0, true);
-
- skey = _bt_mkscankey_nodata(index);
- natts = RelationGetNumberOfAttributes(index);
+ BTPageState *state = NULL;
for (;;)
{
+ BTItem bti;
+ bool should_free;
+
bti = (BTItem) tuplesort_getindextuple(btspool->sortstate, true,
&should_free);
if (bti == (BTItem) NULL)
break;
- _bt_buildadd(index, natts, skey, state, bti, BTP_LEAF);
+
+ /* When we see first tuple, create first index page */
+ if (state == NULL)
+ state = _bt_pagestate(index, BTP_LEAF, 0);
+
+ _bt_buildadd(index, state, bti, BTP_LEAF);
if (should_free)
pfree((void *) bti);
}
- _bt_uppershutdown(index, natts, skey, state);
-
- _bt_freeskey(skey);
+ if (state != NULL)
+ _bt_uppershutdown(index, state);
}