Diffstat (limited to 'src/backend/access/nbtree/nbtinsert.c')
-rw-r--r--  src/backend/access/nbtree/nbtinsert.c | 268
1 file changed, 132 insertions, 136 deletions
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index c73ba358ec1..33c7612aac5 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.126 2005/10/12 17:18:03 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.127 2005/10/15 02:49:09 momjian Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -93,30 +93,29 @@ top:
 
 	/*
 	 * If the page was split between the time that we surrendered our read
-	 * lock and acquired our write lock, then this page may no longer be
-	 * the right place for the key we want to insert.  In this case, we
-	 * need to move right in the tree.  See Lehman and Yao for an
-	 * excruciatingly precise description.
+	 * lock and acquired our write lock, then this page may no longer be the
+	 * right place for the key we want to insert.  In this case, we need to
+	 * move right in the tree.  See Lehman and Yao for an excruciatingly
+	 * precise description.
 	 */
 	buf = _bt_moveright(rel, buf, natts, itup_scankey, false, BT_WRITE);
 
 	/*
-	 * If we're not allowing duplicates, make sure the key isn't already
-	 * in the index.
+	 * If we're not allowing duplicates, make sure the key isn't already in
+	 * the index.
 	 *
-	 * NOTE: obviously, _bt_check_unique can only detect keys that are
-	 * already in the index; so it cannot defend against concurrent
-	 * insertions of the same key.  We protect against that by means of
-	 * holding a write lock on the target page.  Any other would-be
-	 * inserter of the same key must acquire a write lock on the same
-	 * target page, so only one would-be inserter can be making the check
-	 * at one time.  Furthermore, once we are past the check we hold write
-	 * locks continuously until we have performed our insertion, so no
-	 * later inserter can fail to see our insertion.  (This requires some
-	 * care in _bt_insertonpg.)
+	 * NOTE: obviously, _bt_check_unique can only detect keys that are already in
+	 * the index; so it cannot defend against concurrent insertions of the
+	 * same key.  We protect against that by means of holding a write lock on
+	 * the target page.  Any other would-be inserter of the same key must
+	 * acquire a write lock on the same target page, so only one would-be
+	 * inserter can be making the check at one time.  Furthermore, once we are
+	 * past the check we hold write locks continuously until we have performed
+	 * our insertion, so no later inserter can fail to see our insertion.
+	 * (This requires some care in _bt_insertonpg.)
 	 *
-	 * If we must wait for another xact, we release the lock while waiting,
-	 * and then must start over completely.
+	 * If we must wait for another xact, we release the lock while waiting, and
+	 * then must start over completely.
 	 */
 	if (index_is_unique)
 	{
@@ -167,8 +166,8 @@ _bt_check_unique(Relation rel, BTItem btitem, Relation heapRel,
 	maxoff = PageGetMaxOffsetNumber(page);
 
 	/*
-	 * Find first item >= proposed new item.  Note we could also get a
-	 * pointer to end-of-page here.
+	 * Find first item >= proposed new item.  Note we could also get a pointer
+	 * to end-of-page here.
 	 */
 	offset = _bt_binsrch(rel, buf, natts, itup_scankey, false);
@@ -194,24 +193,24 @@ _bt_check_unique(Relation rel, BTItem btitem, Relation heapRel,
 			/*
 			 * We can skip items that are marked killed.
 			 *
-			 * Formerly, we applied _bt_isequal() before checking the kill
-			 * flag, so as to fall out of the item loop as soon as
-			 * possible.  However, in the presence of heavy update activity
-			 * an index may contain many killed items with the same key;
-			 * running _bt_isequal() on each killed item gets expensive.
-			 * Furthermore it is likely that the non-killed version of
-			 * each key appears first, so that we didn't actually get to
-			 * exit any sooner anyway.  So now we just advance over killed
-			 * items as quickly as we can.  We only apply _bt_isequal()
-			 * when we get to a non-killed item or the end of the page.
+			 * Formerly, we applied _bt_isequal() before checking the kill flag,
+			 * so as to fall out of the item loop as soon as possible.
+			 * However, in the presence of heavy update activity an index may
+			 * contain many killed items with the same key; running
+			 * _bt_isequal() on each killed item gets expensive.  Furthermore
+			 * it is likely that the non-killed version of each key appears
+			 * first, so that we didn't actually get to exit any sooner
+			 * anyway.  So now we just advance over killed items as quickly as
+			 * we can.  We only apply _bt_isequal() when we get to a non-killed
+			 * item or the end of the page.
 			 */
 			if (!ItemIdDeleted(curitemid))
 			{
 				/*
-				 * _bt_compare returns 0 for (1,NULL) and (1,NULL) -
-				 * this's how we handling NULLs - and so we must not use
-				 * _bt_compare in real comparison, but only for
-				 * ordering/finding items on pages. - vadim 03/24/97
+				 * _bt_compare returns 0 for (1,NULL) and (1,NULL) - this's
+				 * how we handling NULLs - and so we must not use _bt_compare
+				 * in real comparison, but only for ordering/finding items on
+				 * pages. - vadim 03/24/97
 				 */
 				if (!_bt_isequal(itupdesc, page, offset, natts, itup_scankey))
 					break;		/* we're past all the equal tuples */
@@ -246,15 +245,15 @@ _bt_check_unique(Relation rel, BTItem btitem, Relation heapRel,
 				 */
 				ereport(ERROR,
 						(errcode(ERRCODE_UNIQUE_VIOLATION),
-						 errmsg("duplicate key violates unique constraint \"%s\"",
-								RelationGetRelationName(rel))));
+				errmsg("duplicate key violates unique constraint \"%s\"",
+					   RelationGetRelationName(rel))));
 			}
 			else if (htup.t_data != NULL)
 			{
 				/*
-				 * Hmm, if we can't see the tuple, maybe it can be
-				 * marked killed.  This logic should match
-				 * index_getnext and btgettuple.
+				 * Hmm, if we can't see the tuple, maybe it can be marked
+				 * killed.  This logic should match index_getnext and
+				 * btgettuple.
 				 */
 				LockBuffer(hbuffer, BUFFER_LOCK_SHARE);
 				if (HeapTupleSatisfiesVacuum(htup.t_data, RecentGlobalXmin,
@@ -377,15 +376,15 @@ _bt_insertonpg(Relation rel,
 	itemsz = IndexTupleDSize(btitem->bti_itup)
 		+ (sizeof(BTItemData) - sizeof(IndexTupleData));
 
-	itemsz = MAXALIGN(itemsz);	/* be safe, PageAddItem will do this but
-								 * we need to be consistent */
+	itemsz = MAXALIGN(itemsz);	/* be safe, PageAddItem will do this but we
+								 * need to be consistent */
 
 	/*
-	 * Check whether the item can fit on a btree page at all. (Eventually,
-	 * we ought to try to apply TOAST methods if not.) We actually need to
-	 * be able to fit three items on every page, so restrict any one item
-	 * to 1/3 the per-page available space. Note that at this point,
-	 * itemsz doesn't include the ItemId.
+	 * Check whether the item can fit on a btree page at all. (Eventually, we
+	 * ought to try to apply TOAST methods if not.) We actually need to be
+	 * able to fit three items on every page, so restrict any one item to 1/3
+	 * the per-page available space. Note that at this point, itemsz doesn't
+	 * include the ItemId.
 	 */
 	if (itemsz > BTMaxItemSize(page))
 		ereport(ERROR,
@@ -393,9 +392,9 @@ _bt_insertonpg(Relation rel,
 				 errmsg("index row size %lu exceeds btree maximum, %lu",
 						(unsigned long) itemsz,
 						(unsigned long) BTMaxItemSize(page)),
-				 errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
-						 "Consider a function index of an MD5 hash of the value, "
-						 "or use full text indexing.")));
+		errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
+				"Consider a function index of an MD5 hash of the value, "
+				"or use full text indexing.")));
 
 	/*
 	 * Determine exactly where new item will go.
@@ -432,11 +431,11 @@ _bt_insertonpg(Relation rel,
 			/*
 			 * step right to next non-dead page
 			 *
-			 * must write-lock that page before releasing write lock on
-			 * current page; else someone else's _bt_check_unique scan
-			 * could fail to see our insertion.  write locks on
-			 * intermediate dead pages won't do because we don't know when
-			 * they will get de-linked from the tree.
+			 * must write-lock that page before releasing write lock on current
+			 * page; else someone else's _bt_check_unique scan could fail to
+			 * see our insertion.  write locks on intermediate dead pages
+			 * won't do because we don't know when they will get de-linked
+			 * from the tree.
 			 */
 			Buffer		rbuf = InvalidBuffer;
@@ -459,9 +458,9 @@ _bt_insertonpg(Relation rel,
 		}
 
 		/*
-		 * Now we are on the right page, so find the insert position. If
-		 * we moved right at all, we know we should insert at the start of
-		 * the page, else must find the position by searching.
+		 * Now we are on the right page, so find the insert position.  If we
+		 * moved right at all, we know we should insert at the start of the
+		 * page, else must find the position by searching.
 		 */
 		if (movedright)
 			newitemoff = P_FIRSTDATAKEY(lpageop);
@@ -472,9 +471,9 @@ _bt_insertonpg(Relation rel,
 	/*
 	 * Do we need to split the page to fit the item on it?
 	 *
-	 * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result,
-	 * so this comparison is correct even though we appear to be
-	 * accounting only for the item and not for its line pointer.
+	 * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result, so
+	 * this comparison is correct even though we appear to be accounting only
+	 * for the item and not for its line pointer.
 	 */
 	if (PageGetFreeSpace(page) < itemsz)
 	{
@@ -522,12 +521,11 @@ _bt_insertonpg(Relation rel,
 		itup_blkno = BufferGetBlockNumber(buf);
 
 		/*
-		 * If we are doing this insert because we split a page that was
-		 * the only one on its tree level, but was not the root, it may
-		 * have been the "fast root".  We need to ensure that the fast
-		 * root link points at or above the current page.  We can safely
-		 * acquire a lock on the metapage here --- see comments for
-		 * _bt_newroot().
+		 * If we are doing this insert because we split a page that was the
+		 * only one on its tree level, but was not the root, it may have been
+		 * the "fast root".  We need to ensure that the fast root link points
+		 * at or above the current page.  We can safely acquire a lock on the
+		 * metapage here --- see comments for _bt_newroot().
 		 */
 		if (split_only_page)
 		{
@@ -692,11 +690,11 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
 	lopaque->btpo.level = ropaque->btpo.level = oopaque->btpo.level;
 
 	/*
-	 * If the page we're splitting is not the rightmost page at its level
-	 * in the tree, then the first entry on the page is the high key for
-	 * the page.  We need to copy that to the right half.  Otherwise
-	 * (meaning the rightmost page case), all the items on the right half
-	 * will be user data.
+	 * If the page we're splitting is not the rightmost page at its level in
+	 * the tree, then the first entry on the page is the high key for the
+	 * page.  We need to copy that to the right half.  Otherwise (meaning the
+	 * rightmost page case), all the items on the right half will be user
+	 * data.
 	 */
 	rightoff = P_HIKEY;
@@ -712,9 +710,9 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
 	}
 
 	/*
-	 * The "high key" for the new left page will be the first key that's
-	 * going to go into the new right page.  This might be either the
-	 * existing data item at position firstright, or the incoming tuple.
+	 * The "high key" for the new left page will be the first key that's going
+	 * to go into the new right page.  This might be either the existing data
+	 * item at position firstright, or the incoming tuple.
 	 */
 	leftoff = P_HIKEY;
 	if (!newitemonleft && newitemoff == firstright)
@@ -806,8 +804,8 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
 	/*
 	 * We have to grab the right sibling (if any) and fix the prev pointer
 	 * there.  We are guaranteed that this is deadlock-free since no other
-	 * writer will be holding a lock on that page and trying to move left,
-	 * and all readers release locks on a page before trying to fetch its
+	 * writer will be holding a lock on that page and trying to move left, and
+	 * all readers release locks on a page before trying to fetch its
 	 * neighbors.
 	 */
@@ -821,8 +819,8 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
 	}
 
 	/*
-	 * Right sibling is locked, new siblings are prepared, but original
-	 * page is not updated yet.  Log changes before continuing.
+	 * Right sibling is locked, new siblings are prepared, but original page
+	 * is not updated yet.  Log changes before continuing.
 	 *
 	 * NO EREPORT(ERROR) till right sibling is updated.
 	 */
@@ -850,10 +848,10 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
 		xlrec.level = lopaque->btpo.level;
 
 		/*
-		 * Direct access to page is not good but faster - we should
-		 * implement some new func in page API.  Note we only store the
-		 * tuples themselves, knowing that the item pointers are in the
-		 * same order and can be reconstructed by scanning the tuples.
+		 * Direct access to page is not good but faster - we should implement
+		 * some new func in page API.  Note we only store the tuples
+		 * themselves, knowing that the item pointers are in the same order
+		 * and can be reconstructed by scanning the tuples.
 		 */
 		xlrec.leftlen = ((PageHeader) leftpage)->pd_special -
 			((PageHeader) leftpage)->pd_upper;
@@ -903,13 +901,13 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
 	}
 
 	/*
-	 * By here, the original data page has been split into two new halves,
-	 * and these are correct.  The algorithm requires that the left page
-	 * never move during a split, so we copy the new left page back on top
-	 * of the original.  Note that this is not a waste of time, since we
-	 * also require (in the page management code) that the center of a
-	 * page always be clean, and the most efficient way to guarantee this
-	 * is just to compact the data by reinserting it into a new left page.
+	 * By here, the original data page has been split into two new halves, and
+	 * these are correct.  The algorithm requires that the left page never
+	 * move during a split, so we copy the new left page back on top of the
+	 * original.  Note that this is not a waste of time, since we also require
+	 * (in the page management code) that the center of a page always be
+	 * clean, and the most efficient way to guarantee this is just to compact
+	 * the data by reinserting it into a new left page.
 	 */
 	PageRestoreTempPage(leftpage, origpage);
@@ -984,13 +982,13 @@ _bt_findsplitloc(Relation rel,
 		MAXALIGN(sizeof(BTPageOpaqueData));
 
 	/*
-	 * Finding the best possible split would require checking all the
-	 * possible split points, because of the high-key and left-key special
-	 * cases.  That's probably more work than it's worth; instead, stop as
-	 * soon as we find a "good-enough" split, where good-enough is defined
-	 * as an imbalance in free space of no more than pagesize/16
-	 * (arbitrary...)  This should let us stop near the middle on most
-	 * pages, instead of plowing to the end.
+	 * Finding the best possible split would require checking all the possible
+	 * split points, because of the high-key and left-key special cases.
+	 * That's probably more work than it's worth; instead, stop as soon as we
+	 * find a "good-enough" split, where good-enough is defined as an
+	 * imbalance in free space of no more than pagesize/16 (arbitrary...) This
+	 * should let us stop near the middle on most pages, instead of plowing to
+	 * the end.
 	 */
 	goodenough = leftspace / 16;
@@ -1006,8 +1004,8 @@ _bt_findsplitloc(Relation rel,
 	dataitemtotal = rightspace - (int) PageGetFreeSpace(page);
 
 	/*
-	 * Scan through the data items and calculate space usage for a split
-	 * at each possible position.
+	 * Scan through the data items and calculate space usage for a split at
+	 * each possible position.
 	 */
 	dataitemstoleft = 0;
 	maxoff = PageGetMaxOffsetNumber(page);
@@ -1024,9 +1022,9 @@ _bt_findsplitloc(Relation rel,
 		itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);
 
 		/*
-		 * We have to allow for the current item becoming the high key of
-		 * the left page; therefore it counts against left space as well
-		 * as right space.
+		 * We have to allow for the current item becoming the high key of the
+		 * left page; therefore it counts against left space as well as right
+		 * space.
 		 */
 		leftfree = leftspace - dataitemstoleft - (int) itemsz;
 		rightfree = rightspace - (dataitemtotal - dataitemstoleft);
@@ -1058,8 +1056,8 @@ _bt_findsplitloc(Relation rel,
 	}
 
 	/*
-	 * I believe it is not possible to fail to find a feasible split, but
-	 * just in case ...
+	 * I believe it is not possible to fail to find a feasible split, but just
+	 * in case ...
 	 */
 	if (!state.have_split)
 		elog(ERROR, "could not find a feasible split point for \"%s\"",
@@ -1105,8 +1103,7 @@ _bt_checksplitloc(FindSplitData *state, OffsetNumber firstright,
 	{
 		/*
 		 * On a rightmost page, try to equalize right free space with
-		 * twice the left free space.  See comments for
-		 * _bt_findsplitloc.
+		 * twice the left free space.  See comments for _bt_findsplitloc.
 		 */
 		delta = (2 * leftfree) - rightfree;
 	}
@@ -1153,19 +1150,18 @@ _bt_insert_parent(Relation rel,
 			   bool is_only)
 {
 	/*
-	 * Here we have to do something Lehman and Yao don't talk about: deal
-	 * with a root split and construction of a new root.  If our stack is
-	 * empty then we have just split a node on what had been the root
-	 * level when we descended the tree.  If it was still the root then we
-	 * perform a new-root construction.  If it *wasn't* the root anymore,
-	 * search to find the next higher level that someone constructed
-	 * meanwhile, and find the right place to insert as for the normal
-	 * case.
+	 * Here we have to do something Lehman and Yao don't talk about: deal with
+	 * a root split and construction of a new root.  If our stack is empty
+	 * then we have just split a node on what had been the root level when we
+	 * descended the tree.  If it was still the root then we perform a
+	 * new-root construction.  If it *wasn't* the root anymore, search to find
+	 * the next higher level that someone constructed meanwhile, and find the
+	 * right place to insert as for the normal case.
 	 *
-	 * If we have to search for the parent level, we do so by re-descending
-	 * from the root.  This is not super-efficient, but it's rare enough
-	 * not to matter.  (This path is also taken when called from WAL
-	 * recovery --- we have no stack in that case.)
+	 * If we have to search for the parent level, we do so by re-descending from
+	 * the root.  This is not super-efficient, but it's rare enough not to
+	 * matter.  (This path is also taken when called from WAL recovery --- we
+	 * have no stack in that case.)
 	 */
 	if (is_root)
 	{
@@ -1219,9 +1215,9 @@ _bt_insert_parent(Relation rel,
 		/*
 		 * Find the parent buffer and get the parent page.
 		 *
-		 * Oops - if we were moved right then we need to change stack item!
-		 * We want to find parent pointing to where we are, right ?  -
-		 * vadim 05/27/97
+		 * Oops - if we were moved right then we need to change stack item! We
+		 * want to find parent pointing to where we are, right ?  - vadim
+		 * 05/27/97
		 */
 		ItemPointerSet(&(stack->bts_btitem.bti_itup.t_tid), bknum, P_HIKEY);
@@ -1291,9 +1287,9 @@ _bt_getstackbuf(Relation rel, BTStack stack, int access)
 			maxoff = PageGetMaxOffsetNumber(page);
 
 			/*
-			 * start = InvalidOffsetNumber means "search the whole page".
-			 * We need this test anyway due to possibility that page has a
-			 * high key now when it didn't before.
+			 * start = InvalidOffsetNumber means "search the whole page".  We
+			 * need this test anyway due to possibility that page has a high
+			 * key now when it didn't before.
 			 */
 			if (start < minoff)
 				start = minoff;
@@ -1307,8 +1303,8 @@ _bt_getstackbuf(Relation rel, BTStack stack, int access)
 
 			/*
 			 * These loops will check every item on the page --- but in an
-			 * order that's attuned to the probability of where it
-			 * actually is.  Scan to the right first, then to the left.
+			 * order that's attuned to the probability of where it actually
+			 * is.  Scan to the right first, then to the left.
 			 */
 			for (offnum = start;
 				 offnum <= maxoff;
@@ -1424,9 +1420,9 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
 	metad->btm_fastlevel = rootopaque->btpo.level;
 
 	/*
-	 * Create downlink item for left page (old root).  Since this will be
-	 * the first item in a non-leaf page, it implicitly has minus-infinity
-	 * key value, so we need not store any actual key in it.
+	 * Create downlink item for left page (old root).  Since this will be the
+	 * first item in a non-leaf page, it implicitly has minus-infinity key
+	 * value, so we need not store any actual key in it.
 	 */
 	itemsz = sizeof(BTItemData);
 	new_item = (BTItem) palloc(itemsz);
@@ -1434,17 +1430,17 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
 	ItemPointerSet(&(new_item->bti_itup.t_tid), lbkno, P_HIKEY);
 
 	/*
-	 * Insert the left page pointer into the new root page.  The root page
-	 * is the rightmost page on its level so there is no "high key" in it;
-	 * the two items will go into positions P_HIKEY and P_FIRSTKEY.
+	 * Insert the left page pointer into the new root page.  The root page is
+	 * the rightmost page on its level so there is no "high key" in it; the
+	 * two items will go into positions P_HIKEY and P_FIRSTKEY.
 	 */
 	if (PageAddItem(rootpage, (Item) new_item, itemsz, P_HIKEY, LP_USED) == InvalidOffsetNumber)
 		elog(PANIC, "failed to add leftkey to new root page");
 	pfree(new_item);
 
 	/*
-	 * Create downlink item for right page.  The key for it is obtained
-	 * from the "high key" position in the left page.
+	 * Create downlink item for right page.  The key for it is obtained from
+	 * the "high key" position in the left page.
 	 */
 	itemid = PageGetItemId(lpage, P_HIKEY);
 	itemsz = ItemIdGetLength(itemid);
@@ -1476,8 +1472,8 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
 	rdata[0].next = &(rdata[1]);
 
 	/*
-	 * Direct access to page is not good but faster - we should
-	 * implement some new func in page API.
+	 * Direct access to page is not good but faster - we should implement
+	 * some new func in page API.
 	 */
 	rdata[1].data = (char *) rootpage + ((PageHeader) rootpage)->pd_upper;
 	rdata[1].len = ((PageHeader) rootpage)->pd_special -
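For readers unfamiliar with the split heuristic discussed in the _bt_findsplitloc hunks above, here is a small standalone C sketch of the "good-enough" stopping rule: scan candidate split points in order and stop at the first one whose free-space imbalance is no more than pagesize/16. This is not part of the patch and not PostgreSQL code; the item sizes and names below are invented for illustration, and details such as high-key accounting and the rightmost-page case are omitted.

#include <stdio.h>

#define PAGESIZE 8192

int
main(void)
{
	/* hypothetical MAXALIGNed item sizes on a page we want to split */
	int			items[] = {400, 900, 300, 1200, 700, 500, 800, 600};
	int			nitems = (int) (sizeof(items) / sizeof(items[0]));
	int			goodenough = PAGESIZE / 16;		/* acceptable imbalance */
	int			total = 0;
	int			tosleft = 0;	/* bytes assigned to the left half so far */
	int			i;

	for (i = 0; i < nitems; i++)
		total += items[i];

	/* consider splits where items[0 .. i-1] stay left and items[i ..] go right */
	for (i = 1; i < nitems; i++)
	{
		int			leftfree,
					rightfree,
					delta;

		tosleft += items[i - 1];
		leftfree = PAGESIZE - tosleft;
		rightfree = PAGESIZE - (total - tosleft);
		delta = leftfree - rightfree;
		if (delta < 0)
			delta = -delta;

		printf("split before item %d: imbalance = %d bytes\n", i, delta);
		if (delta <= goodenough)
		{
			/* stop scanning as soon as the split is "good enough" */
			printf("good-enough split found before item %d\n", i);
			break;
		}
	}
	return 0;
}

As the comments in the diff note, the real _bt_findsplitloc additionally charges the would-be high key against the left half, and _bt_checksplitloc handles the rightmost-page case by aiming for twice as much free space on the left as on the right.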