diff options
Diffstat (limited to 'src/backend/access/nbtree/nbtsort.c')
-rw-r--r-- | src/backend/access/nbtree/nbtsort.c | 272 |
1 files changed, 116 insertions, 156 deletions
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 458abe7754c..1981f554693 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -6,8 +6,12 @@ * * We use tuplesort.c to sort the given index tuples into order. * Then we scan the index tuples in order and build the btree pages - * for each level. When we have only one page on a level, it must be the - * root -- it can be attached to the btree metapage and we are done. + * for each level. We load source tuples into leaf-level pages. + * Whenever we fill a page at one level, we add a link to it to its + * parent level (starting a new parent level if necessary). When + * done, we write out each final page on each level, adding it to + * its parent level. When we have only one page on a level, it must be + * the root -- it can be attached to the btree metapage and we are done. * * this code is moderately slow (~10% slower) compared to the regular * btree (insertion) build code on sorted or well-clustered data. on @@ -23,12 +27,20 @@ * something like the standard 70% steady-state load factor for btrees * would probably be better. * + * Another limitation is that we currently load full copies of all keys + * into upper tree levels. The leftmost data key in each non-leaf node + * could be omitted as far as normal btree operations are concerned + * (see README for more info). However, because we build the tree from + * the bottom up, we need that data key to insert into the node's parent. + * This could be fixed by keeping a spare copy of the minimum key in the + * state stack, but I haven't time for that right now. + * * * Portions Copyright (c) 1996-2000, PostgreSQL, Inc * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsort.c,v 1.54 2000/06/15 04:09:36 momjian Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsort.c,v 1.55 2000/07/21 06:42:33 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -57,6 +69,20 @@ struct BTSpool bool isunique; }; +/* + * Status record for a btree page being built. We have one of these + * for each active tree level. + */ +typedef struct BTPageState +{ + Buffer btps_buf; /* current buffer & page */ + Page btps_page; + OffsetNumber btps_lastoff; /* last item offset loaded */ + int btps_level; + struct BTPageState *btps_next; /* link to parent level, if any */ +} BTPageState; + + #define BTITEMSZ(btitem) \ ((btitem) ? \ (IndexTupleDSize((btitem)->bti_itup) + \ @@ -65,13 +91,11 @@ struct BTSpool static void _bt_load(Relation index, BTSpool *btspool); -static BTItem _bt_buildadd(Relation index, Size keysz, ScanKey scankey, - BTPageState *state, BTItem bti, int flags); +static void _bt_buildadd(Relation index, BTPageState *state, + BTItem bti, int flags); static BTItem _bt_minitem(Page opage, BlockNumber oblkno, int atend); -static BTPageState *_bt_pagestate(Relation index, int flags, - int level, bool doupper); -static void _bt_uppershutdown(Relation index, Size keysz, ScanKey scankey, - BTPageState *state); +static BTPageState *_bt_pagestate(Relation index, int flags, int level); +static void _bt_uppershutdown(Relation index, BTPageState *state); /* @@ -159,9 +183,6 @@ _bt_blnewpage(Relation index, Buffer *buf, Page *page, int flags) BTPageOpaque opaque; *buf = _bt_getbuf(index, P_NEW, BT_WRITE); -#ifdef NOT_USED - printf("\tblk=%d\n", BufferGetBlockNumber(*buf)); -#endif *page = BufferGetPage(*buf); _bt_pageinit(*page, BufferGetPageSize(*buf)); opaque = (BTPageOpaque) PageGetSpecialPointer(*page); @@ -202,18 +223,15 @@ _bt_slideleft(Relation index, Buffer buf, Page page) * is suitable for immediate use by _bt_buildadd. */ static BTPageState * -_bt_pagestate(Relation index, int flags, int level, bool doupper) +_bt_pagestate(Relation index, int flags, int level) { BTPageState *state = (BTPageState *) palloc(sizeof(BTPageState)); MemSet((char *) state, 0, sizeof(BTPageState)); _bt_blnewpage(index, &(state->btps_buf), &(state->btps_page), flags); - state->btps_firstoff = InvalidOffsetNumber; state->btps_lastoff = P_HIKEY; - state->btps_lastbti = (BTItem) NULL; state->btps_next = (BTPageState *) NULL; state->btps_level = level; - state->btps_doupper = doupper; return state; } @@ -240,31 +258,27 @@ _bt_minitem(Page opage, BlockNumber oblkno, int atend) } /* - * add an item to a disk page from a merge tape block. + * add an item to a disk page from the sort output. * * we must be careful to observe the following restrictions, placed * upon us by the conventions in nbtsearch.c: * - rightmost pages start data items at P_HIKEY instead of at * P_FIRSTKEY. - * - duplicates cannot be split among pages unless the chain of - * duplicates starts at the first data item. * * a leaf page being built looks like: * * +----------------+---------------------------------+ * | PageHeaderData | linp0 linp1 linp2 ... | * +-----------+----+---------------------------------+ - * | ... linpN | ^ first | + * | ... linpN | | * +-----------+--------------------------------------+ * | ^ last | * | | - * | v last | * +-------------+------------------------------------+ * | | itemN ... | * +-------------+------------------+-----------------+ * | ... item3 item2 item1 | "special space" | * +--------------------------------+-----------------+ - * ^ first * * contrast this with the diagram in bufpage.h; note the mismatch * between linps and items. this is because we reserve linp0 as a @@ -272,30 +286,20 @@ _bt_minitem(Page opage, BlockNumber oblkno, int atend) * filled up the page, we will set linp0 to point to itemN and clear * linpN. * - * 'last' pointers indicate the last offset/item added to the page. - * 'first' pointers indicate the first offset/item that is part of a - * chain of duplicates extending from 'first' to 'last'. - * - * if all keys are unique, 'first' will always be the same as 'last'. + * 'last' pointer indicates the last offset added to the page. */ -static BTItem -_bt_buildadd(Relation index, Size keysz, ScanKey scankey, - BTPageState *state, BTItem bti, int flags) +static void +_bt_buildadd(Relation index, BTPageState *state, BTItem bti, int flags) { Buffer nbuf; Page npage; - BTItem last_bti; - OffsetNumber first_off; OffsetNumber last_off; - OffsetNumber off; Size pgspc; Size btisz; nbuf = state->btps_buf; npage = state->btps_page; - first_off = state->btps_firstoff; last_off = state->btps_lastoff; - last_bti = state->btps_lastbti; pgspc = PageGetFreeSpace(npage); btisz = BTITEMSZ(bti); @@ -319,75 +323,55 @@ _bt_buildadd(Relation index, Size keysz, ScanKey scankey, if (pgspc < btisz) { + /* + * Item won't fit on this page, so finish off the page and + * write it out. + */ Buffer obuf = nbuf; Page opage = npage; - OffsetNumber o, - n; ItemId ii; ItemId hii; + BTItem nbti; _bt_blnewpage(index, &nbuf, &npage, flags); /* - * if 'last' is part of a chain of duplicates that does not start - * at the beginning of the old page, the entire chain is copied to - * the new page; we delete all of the duplicates from the old page - * except the first, which becomes the high key item of the old - * page. + * We copy the last item on the page into the new page, and then + * rearrange the old page so that the 'last item' becomes its high + * key rather than a true data item. * - * if the chain starts at the beginning of the page or there is no - * chain ('first' == 'last'), we need only copy 'last' to the new - * page. again, 'first' (== 'last') becomes the high key of the - * old page. - * - * note that in either case, we copy at least one item to the new - * page, so 'last_bti' will always be valid. 'bti' will never be - * the first data item on the new page. + * note that since we always copy an item to the new page, + * 'bti' will never be the first data item on the new page. */ - if (first_off == P_FIRSTKEY) + ii = PageGetItemId(opage, last_off); + if (PageAddItem(npage, PageGetItem(opage, ii), ii->lp_len, + P_FIRSTKEY, LP_USED) == InvalidOffsetNumber) + elog(FATAL, "btree: failed to add item to the page in _bt_sort (1)"); +#ifdef FASTBUILD_DEBUG { - Assert(last_off != P_FIRSTKEY); - first_off = last_off; + bool isnull; + BTItem tmpbti = + (BTItem) PageGetItem(npage, PageGetItemId(npage, P_FIRSTKEY)); + Datum d = index_getattr(&(tmpbti->bti_itup), 1, + index->rd_att, &isnull); + + printf("_bt_buildadd: moved <%x> to offset %d at level %d\n", + d, P_FIRSTKEY, state->btps_level); } - for (o = first_off, n = P_FIRSTKEY; - o <= last_off; - o = OffsetNumberNext(o), n = OffsetNumberNext(n)) - { - ii = PageGetItemId(opage, o); - if (PageAddItem(npage, PageGetItem(opage, ii), - ii->lp_len, n, LP_USED) == InvalidOffsetNumber) - elog(FATAL, "btree: failed to add item to the page in _bt_sort (1)"); -#ifdef FASTBUILD_DEBUG - { - bool isnull; - BTItem tmpbti = - (BTItem) PageGetItem(npage, PageGetItemId(npage, n)); - Datum d = index_getattr(&(tmpbti->bti_itup), 1, - index->rd_att, &isnull); - - printf("_bt_buildadd: moved <%x> to offset %d at level %d\n", - d, n, state->btps_level); - } #endif - } /* - * this loop is backward because PageIndexTupleDelete shuffles the - * tuples to fill holes in the page -- by starting at the end and - * working back, we won't create holes (and thereby avoid - * shuffling). + * Move 'last' into the high key position on opage */ - for (o = last_off; o > first_off; o = OffsetNumberPrev(o)) - PageIndexTupleDelete(opage, o); hii = PageGetItemId(opage, P_HIKEY); - ii = PageGetItemId(opage, first_off); *hii = *ii; ii->lp_flags &= ~LP_USED; ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData); - first_off = P_FIRSTKEY; + /* + * Reset last_off to point to new page + */ last_off = PageGetMaxOffsetNumber(npage); - last_bti = (BTItem) PageGetItem(npage, PageGetItemId(npage, last_off)); /* * set the page (side link) pointers. @@ -399,32 +383,21 @@ _bt_buildadd(Relation index, Size keysz, ScanKey scankey, oopaque->btpo_next = BufferGetBlockNumber(nbuf); nopaque->btpo_prev = BufferGetBlockNumber(obuf); nopaque->btpo_next = P_NONE; - - if (_bt_itemcmp(index, keysz, scankey, - (BTItem) PageGetItem(opage, PageGetItemId(opage, P_HIKEY)), - (BTItem) PageGetItem(opage, PageGetItemId(opage, P_FIRSTKEY)), - BTEqualStrategyNumber)) - oopaque->btpo_flags |= BTP_CHAIN; } /* - * copy the old buffer's minimum key to its parent. if we don't - * have a parent, we have to create one; this adds a new btree - * level. + * Link the old buffer into its parent, using its minimum key. + * If we don't have a parent, we have to create one; + * this adds a new btree level. */ - if (state->btps_doupper) + if (state->btps_next == (BTPageState *) NULL) { - BTItem nbti; - - if (state->btps_next == (BTPageState *) NULL) - { - state->btps_next = - _bt_pagestate(index, 0, state->btps_level + 1, true); - } - nbti = _bt_minitem(opage, BufferGetBlockNumber(obuf), 0); - _bt_buildadd(index, keysz, scankey, state->btps_next, nbti, 0); - pfree((void *) nbti); + state->btps_next = + _bt_pagestate(index, 0, state->btps_level + 1); } + nbti = _bt_minitem(opage, BufferGetBlockNumber(obuf), 0); + _bt_buildadd(index, state->btps_next, nbti, 0); + pfree((void *) nbti); /* * write out the old stuff. we never want to see it again, so we @@ -435,11 +408,11 @@ _bt_buildadd(Relation index, Size keysz, ScanKey scankey, } /* - * if this item is different from the last item added, we start a new - * chain of duplicates. + * Add the new item into the current page. */ - off = OffsetNumberNext(last_off); - if (PageAddItem(npage, (Item) bti, btisz, off, LP_USED) == InvalidOffsetNumber) + last_off = OffsetNumberNext(last_off); + if (PageAddItem(npage, (Item) bti, btisz, + last_off, LP_USED) == InvalidOffsetNumber) elog(FATAL, "btree: failed to add item to the page in _bt_sort (2)"); #ifdef FASTBUILD_DEBUG { @@ -447,65 +420,57 @@ _bt_buildadd(Relation index, Size keysz, ScanKey scankey, Datum d = index_getattr(&(bti->bti_itup), 1, index->rd_att, &isnull); printf("_bt_buildadd: inserted <%x> at offset %d at level %d\n", - d, off, state->btps_level); + d, last_off, state->btps_level); } #endif - if (last_bti == (BTItem) NULL) - first_off = P_FIRSTKEY; - else if (!_bt_itemcmp(index, keysz, scankey, - bti, last_bti, BTEqualStrategyNumber)) - first_off = off; - last_off = off; - last_bti = (BTItem) PageGetItem(npage, PageGetItemId(npage, off)); state->btps_buf = nbuf; state->btps_page = npage; - state->btps_lastbti = last_bti; state->btps_lastoff = last_off; - state->btps_firstoff = first_off; - - return last_bti; } +/* + * Finish writing out the completed btree. + */ static void -_bt_uppershutdown(Relation index, Size keysz, ScanKey scankey, - BTPageState *state) +_bt_uppershutdown(Relation index, BTPageState *state) { BTPageState *s; BlockNumber blkno; BTPageOpaque opaque; BTItem bti; + /* + * Each iteration of this loop completes one more level of the tree. + */ for (s = state; s != (BTPageState *) NULL; s = s->btps_next) { blkno = BufferGetBlockNumber(s->btps_buf); opaque = (BTPageOpaque) PageGetSpecialPointer(s->btps_page); /* - * if this is the root, attach it to the metapage. otherwise, - * stick the minimum key of the last page on this level (which has - * not been split, or else it wouldn't be the last page) into its - * parent. this may cause the last page of upper levels to split, - * but that's not a problem -- we haven't gotten to them yet. + * We have to link the last page on this level to somewhere. + * + * If we're at the top, it's the root, so attach it to the metapage. + * Otherwise, add an entry for it to its parent using its minimum + * key. This may cause the last page of the parent level to split, + * but that's not a problem -- we haven't gotten to it yet. */ - if (s->btps_doupper) + if (s->btps_next == (BTPageState *) NULL) { - if (s->btps_next == (BTPageState *) NULL) - { - opaque->btpo_flags |= BTP_ROOT; - _bt_metaproot(index, blkno, s->btps_level + 1); - } - else - { - bti = _bt_minitem(s->btps_page, blkno, 0); - _bt_buildadd(index, keysz, scankey, s->btps_next, bti, 0); - pfree((void *) bti); - } + opaque->btpo_flags |= BTP_ROOT; + _bt_metaproot(index, blkno, s->btps_level + 1); + } + else + { + bti = _bt_minitem(s->btps_page, blkno, 0); + _bt_buildadd(index, s->btps_next, bti, 0); + pfree((void *) bti); } /* - * this is the rightmost page, so the ItemId array needs to be - * slid back one slot. + * This is the rightmost page, so the ItemId array needs to be + * slid back one slot. Then we can dump out the page. */ _bt_slideleft(index, s->btps_buf, s->btps_page); _bt_wrtbuf(index, s->btps_buf); @@ -519,32 +484,27 @@ _bt_uppershutdown(Relation index, Size keysz, ScanKey scankey, static void _bt_load(Relation index, BTSpool *btspool) { - BTPageState *state; - ScanKey skey; - int natts; - BTItem bti; - bool should_free; - - /* - * initialize state needed for the merge into the btree leaf pages. - */ - state = _bt_pagestate(index, BTP_LEAF, 0, true); - - skey = _bt_mkscankey_nodata(index); - natts = RelationGetNumberOfAttributes(index); + BTPageState *state = NULL; for (;;) { + BTItem bti; + bool should_free; + bti = (BTItem) tuplesort_getindextuple(btspool->sortstate, true, &should_free); if (bti == (BTItem) NULL) break; - _bt_buildadd(index, natts, skey, state, bti, BTP_LEAF); + + /* When we see first tuple, create first index page */ + if (state == NULL) + state = _bt_pagestate(index, BTP_LEAF, 0); + + _bt_buildadd(index, state, bti, BTP_LEAF); if (should_free) pfree((void *) bti); } - _bt_uppershutdown(index, natts, skey, state); - - _bt_freeskey(skey); + if (state != NULL) + _bt_uppershutdown(index, state); } |