Diffstat (limited to 'src/backend/access/nbtree/nbtinsert.c')
-rw-r--r--  src/backend/access/nbtree/nbtinsert.c  268
1 file changed, 132 insertions(+), 136 deletions(-)
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index c73ba358ec1..33c7612aac5 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.126 2005/10/12 17:18:03 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.127 2005/10/15 02:49:09 momjian Exp $
*
*-------------------------------------------------------------------------
*/
@@ -93,30 +93,29 @@ top:
/*
* If the page was split between the time that we surrendered our read
- * lock and acquired our write lock, then this page may no longer be
- * the right place for the key we want to insert. In this case, we
- * need to move right in the tree. See Lehman and Yao for an
- * excruciatingly precise description.
+ * lock and acquired our write lock, then this page may no longer be the
+ * right place for the key we want to insert. In this case, we need to
+ * move right in the tree. See Lehman and Yao for an excruciatingly
+ * precise description.
*/
buf = _bt_moveright(rel, buf, natts, itup_scankey, false, BT_WRITE);
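
For readers new to Lehman and Yao, a minimal, self-contained model of the move-right rule the comment refers to (simplified types and keys, not PostgreSQL's buffer or lock API): a concurrent split can only have shifted our target page to the right, and each page's high key tells us whether to keep stepping.

#include <stdbool.h>

/* Toy page: just a high key and a right-sibling pointer. */
typedef struct ModelPage
{
    bool              rightmost;  /* the rightmost page has no high key */
    int               highkey;    /* upper bound of keys on this page */
    struct ModelPage *right;
} ModelPage;

static ModelPage *
model_moveright(ModelPage *page, int insertkey)
{
    /*
     * If the page split after we gave up our read lock, keys greater than
     * the (new) high key now live on some page to the right.  The real
     * code locks the right sibling before releasing the current page.
     */
    while (!page->rightmost && insertkey > page->highkey)
        page = page->right;
    return page;
}
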
/*
- * If we're not allowing duplicates, make sure the key isn't already
- * in the index.
+ * If we're not allowing duplicates, make sure the key isn't already in
+ * the index.
*
- * NOTE: obviously, _bt_check_unique can only detect keys that are
- * already in the index; so it cannot defend against concurrent
- * insertions of the same key. We protect against that by means of
- * holding a write lock on the target page. Any other would-be
- * inserter of the same key must acquire a write lock on the same
- * target page, so only one would-be inserter can be making the check
- * at one time. Furthermore, once we are past the check we hold write
- * locks continuously until we have performed our insertion, so no
- * later inserter can fail to see our insertion. (This requires some
- * care in _bt_insertonpg.)
+ * NOTE: obviously, _bt_check_unique can only detect keys that are already in
+ * the index; so it cannot defend against concurrent insertions of the
+ * same key. We protect against that by means of holding a write lock on
+ * the target page. Any other would-be inserter of the same key must
+ * acquire a write lock on the same target page, so only one would-be
+ * inserter can be making the check at one time. Furthermore, once we are
+ * past the check we hold write locks continuously until we have performed
+ * our insertion, so no later inserter can fail to see our insertion.
+ * (This requires some care in _bt_insertonpg.)
*
- * If we must wait for another xact, we release the lock while waiting,
- * and then must start over completely.
+ * If we must wait for another xact, we release the lock while waiting, and
+ * then must start over completely.
*/
if (index_is_unique)
{
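
A control-flow sketch of the locking protocol described in the comment above (hypothetical helper names, not the real routines): the write lock on the target leaf page is what serializes concurrent would-be inserters of the same key, and a conflicting transaction that is still in progress forces us to drop the lock, wait, and start over.

#include <stdbool.h>

typedef enum { UNIQUE_OK, UNIQUE_DUPLICATE, UNIQUE_MUST_WAIT } UniqueCheck;

/* Hypothetical stand-ins; only the control flow matters here. */
extern UniqueCheck check_unique(const char *key);   /* like _bt_check_unique */
extern void lock_target_page(void);
extern void unlock_target_page(void);
extern void wait_for_other_xact(void);

static bool
unique_insert_flow(const char *key)
{
    for (;;)
    {
        lock_target_page();     /* write lock: serializes same-key inserters */

        switch (check_unique(key))
        {
            case UNIQUE_MUST_WAIT:
                /* possible conflict still in flight: release, wait, retry */
                unlock_target_page();
                wait_for_other_xact();
                continue;
            case UNIQUE_DUPLICATE:
                unlock_target_page();
                return false;   /* the real code raises a unique-violation error */
            case UNIQUE_OK:
                /*
                 * Keep the lock until the insertion is done, so any later
                 * checker is guaranteed to see our tuple.
                 */
                /* ... perform the insertion here ... */
                unlock_target_page();
                return true;
        }
    }
}
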
@@ -167,8 +166,8 @@ _bt_check_unique(Relation rel, BTItem btitem, Relation heapRel,
maxoff = PageGetMaxOffsetNumber(page);
/*
- * Find first item >= proposed new item. Note we could also get a
- * pointer to end-of-page here.
+ * Find first item >= proposed new item. Note we could also get a pointer
+ * to end-of-page here.
*/
offset = _bt_binsrch(rel, buf, natts, itup_scankey, false);
@@ -194,24 +193,24 @@ _bt_check_unique(Relation rel, BTItem btitem, Relation heapRel,
/*
* We can skip items that are marked killed.
*
- * Formerly, we applied _bt_isequal() before checking the kill
- * flag, so as to fall out of the item loop as soon as
- * possible. However, in the presence of heavy update activity
- * an index may contain many killed items with the same key;
- * running _bt_isequal() on each killed item gets expensive.
- * Furthermore it is likely that the non-killed version of
- * each key appears first, so that we didn't actually get to
- * exit any sooner anyway. So now we just advance over killed
- * items as quickly as we can. We only apply _bt_isequal()
- * when we get to a non-killed item or the end of the page.
+ * Formerly, we applied _bt_isequal() before checking the kill flag,
+ * so as to fall out of the item loop as soon as possible.
+ * However, in the presence of heavy update activity an index may
+ * contain many killed items with the same key; running
+ * _bt_isequal() on each killed item gets expensive. Furthermore
+ * it is likely that the non-killed version of each key appears
+ * first, so that we didn't actually get to exit any sooner
+ * anyway. So now we just advance over killed items as quickly as
+ * we can. We only apply _bt_isequal() when we get to a non-killed
+ * item or the end of the page.
*/
if (!ItemIdDeleted(curitemid))
{
/*
- * _bt_compare returns 0 for (1,NULL) and (1,NULL) -
- * this's how we handling NULLs - and so we must not use
- * _bt_compare in real comparison, but only for
- * ordering/finding items on pages. - vadim 03/24/97
+ * _bt_compare returns 0 for (1,NULL) and (1,NULL) - this's
+ * how we handling NULLs - and so we must not use _bt_compare
+ * in real comparison, but only for ordering/finding items on
+ * pages. - vadim 03/24/97
*/
if (!_bt_isequal(itupdesc, page, offset, natts, itup_scankey))
break; /* we're past all the equal tuples */
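
The cheap-check-first ordering described above, in isolation (simplified items, hypothetical names): skip killed entries without paying for a key comparison, and only compare once a live item or the end of the page is reached.

#include <stdbool.h>

typedef struct ModelItem
{
    bool    killed;     /* like an ItemId with the deleted flag set */
    int     key;
} ModelItem;

static int
first_live_nonequal(const ModelItem *items, int nitems, int newkey)
{
    int     i;

    for (i = 0; i < nitems; i++)
    {
        if (items[i].killed)
            continue;           /* advance over killed items as fast as possible */
        if (items[i].key != newkey)
            break;              /* past all the equal tuples */
        /* live, equal item: the real code would now check tuple visibility */
    }
    return i;
}
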
@@ -246,15 +245,15 @@ _bt_check_unique(Relation rel, BTItem btitem, Relation heapRel,
*/
ereport(ERROR,
(errcode(ERRCODE_UNIQUE_VIOLATION),
- errmsg("duplicate key violates unique constraint \"%s\"",
- RelationGetRelationName(rel))));
+ errmsg("duplicate key violates unique constraint \"%s\"",
+ RelationGetRelationName(rel))));
}
else if (htup.t_data != NULL)
{
/*
- * Hmm, if we can't see the tuple, maybe it can be
- * marked killed. This logic should match
- * index_getnext and btgettuple.
+ * Hmm, if we can't see the tuple, maybe it can be marked
+ * killed. This logic should match index_getnext and
+ * btgettuple.
*/
LockBuffer(hbuffer, BUFFER_LOCK_SHARE);
if (HeapTupleSatisfiesVacuum(htup.t_data, RecentGlobalXmin,
@@ -377,15 +376,15 @@ _bt_insertonpg(Relation rel,
itemsz = IndexTupleDSize(btitem->bti_itup)
+ (sizeof(BTItemData) - sizeof(IndexTupleData));
- itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but
- * we need to be consistent */
+ itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we
+ * need to be consistent */
/*
- * Check whether the item can fit on a btree page at all. (Eventually,
- * we ought to try to apply TOAST methods if not.) We actually need to
- * be able to fit three items on every page, so restrict any one item
- * to 1/3 the per-page available space. Note that at this point,
- * itemsz doesn't include the ItemId.
+ * Check whether the item can fit on a btree page at all. (Eventually, we
+ * ought to try to apply TOAST methods if not.) We actually need to be
+ * able to fit three items on every page, so restrict any one item to 1/3
+ * the per-page available space. Note that at this point, itemsz doesn't
+ * include the ItemId.
*/
if (itemsz > BTMaxItemSize(page))
ereport(ERROR,
@@ -393,9 +392,9 @@ _bt_insertonpg(Relation rel,
errmsg("index row size %lu exceeds btree maximum, %lu",
(unsigned long) itemsz,
(unsigned long) BTMaxItemSize(page)),
- errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
- "Consider a function index of an MD5 hash of the value, "
- "or use full text indexing.")));
+ errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
+ "Consider a function index of an MD5 hash of the value, "
+ "or use full text indexing.")));
/*
* Determine exactly where new item will go.
@@ -432,11 +431,11 @@ _bt_insertonpg(Relation rel,
/*
* step right to next non-dead page
*
- * must write-lock that page before releasing write lock on
- * current page; else someone else's _bt_check_unique scan
- * could fail to see our insertion. write locks on
- * intermediate dead pages won't do because we don't know when
- * they will get de-linked from the tree.
+ * must write-lock that page before releasing write lock on current
+ * page; else someone else's _bt_check_unique scan could fail to
+ * see our insertion. write locks on intermediate dead pages
+ * won't do because we don't know when they will get de-linked
+ * from the tree.
*/
Buffer rbuf = InvalidBuffer;
@@ -459,9 +458,9 @@ _bt_insertonpg(Relation rel,
}
/*
- * Now we are on the right page, so find the insert position. If
- * we moved right at all, we know we should insert at the start of
- * the page, else must find the position by searching.
+ * Now we are on the right page, so find the insert position. If we
+ * moved right at all, we know we should insert at the start of the
+ * page, else must find the position by searching.
*/
if (movedright)
newitemoff = P_FIRSTDATAKEY(lpageop);
@@ -472,9 +471,9 @@ _bt_insertonpg(Relation rel,
/*
* Do we need to split the page to fit the item on it?
*
- * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result,
- * so this comparison is correct even though we appear to be
- * accounting only for the item and not for its line pointer.
+ * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result, so
+ * this comparison is correct even though we appear to be accounting only
+ * for the item and not for its line pointer.
*/
if (PageGetFreeSpace(page) < itemsz)
{
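
To make the note about PageGetFreeSpace() concrete, a simplified model (hypothetical struct, illustrative line-pointer size): the value it returns already has room for one more line pointer subtracted, so comparing it against the MAXALIGN'd tuple size alone is enough.

#include <stddef.h>

typedef struct ModelPageHeader
{
    size_t  pd_lower;   /* end of the line-pointer array */
    size_t  pd_upper;   /* start of the tuple data */
} ModelPageHeader;

#define MODEL_LINE_POINTER_SIZE 4   /* stands in for sizeof(ItemIdData) */

/*
 * Free space as the insertion code sees it: the hole between line pointers
 * and tuples, minus space reserved for one new line pointer.
 */
static size_t
model_free_space(const ModelPageHeader *p)
{
    size_t  hole = p->pd_upper - p->pd_lower;

    return (hole > MODEL_LINE_POINTER_SIZE) ? hole - MODEL_LINE_POINTER_SIZE : 0;
}
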
@@ -522,12 +521,11 @@ _bt_insertonpg(Relation rel,
itup_blkno = BufferGetBlockNumber(buf);
/*
- * If we are doing this insert because we split a page that was
- * the only one on its tree level, but was not the root, it may
- * have been the "fast root". We need to ensure that the fast
- * root link points at or above the current page. We can safely
- * acquire a lock on the metapage here --- see comments for
- * _bt_newroot().
+ * If we are doing this insert because we split a page that was the
+ * only one on its tree level, but was not the root, it may have been
+ * the "fast root". We need to ensure that the fast root link points
+ * at or above the current page. We can safely acquire a lock on the
+ * metapage here --- see comments for _bt_newroot().
*/
if (split_only_page)
{
@@ -692,11 +690,11 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
lopaque->btpo.level = ropaque->btpo.level = oopaque->btpo.level;
/*
- * If the page we're splitting is not the rightmost page at its level
- * in the tree, then the first entry on the page is the high key for
- * the page. We need to copy that to the right half. Otherwise
- * (meaning the rightmost page case), all the items on the right half
- * will be user data.
+ * If the page we're splitting is not the rightmost page at its level in
+ * the tree, then the first entry on the page is the high key for the
+ * page. We need to copy that to the right half. Otherwise (meaning the
+ * rightmost page case), all the items on the right half will be user
+ * data.
*/
rightoff = P_HIKEY;
@@ -712,9 +710,9 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
}
/*
- * The "high key" for the new left page will be the first key that's
- * going to go into the new right page. This might be either the
- * existing data item at position firstright, or the incoming tuple.
+ * The "high key" for the new left page will be the first key that's going
+ * to go into the new right page. This might be either the existing data
+ * item at position firstright, or the incoming tuple.
*/
leftoff = P_HIKEY;
if (!newitemonleft && newitemoff == firstright)
@@ -806,8 +804,8 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
/*
* We have to grab the right sibling (if any) and fix the prev pointer
* there. We are guaranteed that this is deadlock-free since no other
- * writer will be holding a lock on that page and trying to move left,
- * and all readers release locks on a page before trying to fetch its
+ * writer will be holding a lock on that page and trying to move left, and
+ * all readers release locks on a page before trying to fetch its
* neighbors.
*/
@@ -821,8 +819,8 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
}
/*
- * Right sibling is locked, new siblings are prepared, but original
- * page is not updated yet. Log changes before continuing.
+ * Right sibling is locked, new siblings are prepared, but original page
+ * is not updated yet. Log changes before continuing.
*
* NO EREPORT(ERROR) till right sibling is updated.
*/
@@ -850,10 +848,10 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
xlrec.level = lopaque->btpo.level;
/*
- * Direct access to page is not good but faster - we should
- * implement some new func in page API. Note we only store the
- * tuples themselves, knowing that the item pointers are in the
- * same order and can be reconstructed by scanning the tuples.
+ * Direct access to page is not good but faster - we should implement
+ * some new func in page API. Note we only store the tuples
+ * themselves, knowing that the item pointers are in the same order
+ * and can be reconstructed by scanning the tuples.
*/
xlrec.leftlen = ((PageHeader) leftpage)->pd_special -
((PageHeader) leftpage)->pd_upper;
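
As a toy illustration of why the line pointers can be left out of the WAL record (this is not the real replay code): if the tuples are laid out back to back in item-pointer order and each carries its own length, one pass over the logged bytes is enough to recompute every offset.

#include <stddef.h>

/*
 * Toy replay pass: each "tuple" here is a length-prefixed blob, so one
 * scan rebuilds the offsets that the omitted item pointers would have
 * recorded.  Lengths fit in one byte purely for brevity.
 */
static int
model_rebuild_offsets(const unsigned char *data, size_t datalen,
                      size_t offsets[], int maxitems)
{
    size_t  pos = 0;
    int     n = 0;

    while (pos < datalen && n < maxitems)
    {
        size_t  len = data[pos];

        if (len == 0)
            break;              /* malformed input; stop rather than loop */
        offsets[n++] = pos;
        pos += len;
    }
    return n;
}
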
@@ -903,13 +901,13 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
}
/*
- * By here, the original data page has been split into two new halves,
- * and these are correct. The algorithm requires that the left page
- * never move during a split, so we copy the new left page back on top
- * of the original. Note that this is not a waste of time, since we
- * also require (in the page management code) that the center of a
- * page always be clean, and the most efficient way to guarantee this
- * is just to compact the data by reinserting it into a new left page.
+ * By here, the original data page has been split into two new halves, and
+ * these are correct. The algorithm requires that the left page never
+ * move during a split, so we copy the new left page back on top of the
+ * original. Note that this is not a waste of time, since we also require
+ * (in the page management code) that the center of a page always be
+ * clean, and the most efficient way to guarantee this is just to compact
+ * the data by reinserting it into a new left page.
*/
PageRestoreTempPage(leftpage, origpage);
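
A bare-bones sketch of the copy-back step (fixed page size assumed; the real PageRestoreTempPage() also releases the temporary copy): assembling the new left half in private memory and copying it over the original buffer is what keeps the left page at the same block number while compacting it.

#include <string.h>

#define MODEL_PAGE_SIZE 8192    /* assumed; PostgreSQL's BLCKSZ is configurable */

/* Copy the finished left half over the original page in place. */
static void
model_restore_temp_page(char *origpage, const char *temppage)
{
    memcpy(origpage, temppage, MODEL_PAGE_SIZE);
}
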
@@ -984,13 +982,13 @@ _bt_findsplitloc(Relation rel,
MAXALIGN(sizeof(BTPageOpaqueData));
/*
- * Finding the best possible split would require checking all the
- * possible split points, because of the high-key and left-key special
- * cases. That's probably more work than it's worth; instead, stop as
- * soon as we find a "good-enough" split, where good-enough is defined
- * as an imbalance in free space of no more than pagesize/16
- * (arbitrary...) This should let us stop near the middle on most
- * pages, instead of plowing to the end.
+ * Finding the best possible split would require checking all the possible
+ * split points, because of the high-key and left-key special cases.
+ * That's probably more work than it's worth; instead, stop as soon as we
+ * find a "good-enough" split, where good-enough is defined as an
+ * imbalance in free space of no more than pagesize/16 (arbitrary...) This
+ * should let us stop near the middle on most pages, instead of plowing to
+ * the end.
*/
goodenough = leftspace / 16;
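
Sketch of the stopping rule just described (plain ints, hypothetical names): once some candidate split's free-space imbalance is within 1/16 of a page's usable space, the scan can stop instead of evaluating every remaining position.

/*
 * "Good enough" test for a candidate split: delta is the absolute
 * difference in free space between the two halves, leftspace the usable
 * space of a page.
 */
static int
model_split_good_enough(int delta, int leftspace)
{
    int     goodenough = leftspace / 16;

    return delta <= goodenough;
}
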
@@ -1006,8 +1004,8 @@ _bt_findsplitloc(Relation rel,
dataitemtotal = rightspace - (int) PageGetFreeSpace(page);
/*
- * Scan through the data items and calculate space usage for a split
- * at each possible position.
+ * Scan through the data items and calculate space usage for a split at
+ * each possible position.
*/
dataitemstoleft = 0;
maxoff = PageGetMaxOffsetNumber(page);
@@ -1024,9 +1022,9 @@ _bt_findsplitloc(Relation rel,
itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);
/*
- * We have to allow for the current item becoming the high key of
- * the left page; therefore it counts against left space as well
- * as right space.
+ * We have to allow for the current item becoming the high key of the
+ * left page; therefore it counts against left space as well as right
+ * space.
*/
leftfree = leftspace - dataitemstoleft - (int) itemsz;
rightfree = rightspace - (dataitemtotal - dataitemstoleft);
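
The same bookkeeping in isolation (plain ints, hypothetical names): the candidate item is charged to the right half as ordinary data and, because a copy of it becomes the left half's new high key, to the left half as well.

/*
 * Free space remaining on each half if we split just before the current
 * item: dataitemstoleft is the total size of items staying on the left,
 * dataitemtotal the total size of all items, itemsz the current item.
 */
static void
model_split_accounting(int leftspace, int rightspace,
                       int dataitemstoleft, int dataitemtotal, int itemsz,
                       int *leftfree, int *rightfree)
{
    *leftfree = leftspace - dataitemstoleft - itemsz;       /* high-key copy */
    *rightfree = rightspace - (dataitemtotal - dataitemstoleft);
}
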
@@ -1058,8 +1056,8 @@ _bt_findsplitloc(Relation rel,
}
/*
- * I believe it is not possible to fail to find a feasible split, but
- * just in case ...
+ * I believe it is not possible to fail to find a feasible split, but just
+ * in case ...
*/
if (!state.have_split)
elog(ERROR, "could not find a feasible split point for \"%s\"",
@@ -1105,8 +1103,7 @@ _bt_checksplitloc(FindSplitData *state, OffsetNumber firstright,
{
/*
* On a rightmost page, try to equalize right free space with
- * twice the left free space. See comments for
- * _bt_findsplitloc.
+ * twice the left free space. See comments for _bt_findsplitloc.
*/
delta = (2 * leftfree) - rightfree;
}
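
And the imbalance metric itself, as a self-contained sketch (hypothetical name): on the rightmost page the target is roughly twice as much free space on the right as on the left, since new keys tend to keep arriving at the high end; elsewhere the target is an even split, and in both cases a smaller delta is a better candidate.

#include <stdlib.h>

static int
model_split_delta(int leftfree, int rightfree, int is_rightmost)
{
    int     delta;

    if (is_rightmost)
        delta = (2 * leftfree) - rightfree; /* aim for rightfree ~= 2 * leftfree */
    else
        delta = leftfree - rightfree;       /* aim for an even split */

    return abs(delta);
}
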
@@ -1153,19 +1150,18 @@ _bt_insert_parent(Relation rel,
bool is_only)
{
/*
- * Here we have to do something Lehman and Yao don't talk about: deal
- * with a root split and construction of a new root. If our stack is
- * empty then we have just split a node on what had been the root
- * level when we descended the tree. If it was still the root then we
- * perform a new-root construction. If it *wasn't* the root anymore,
- * search to find the next higher level that someone constructed
- * meanwhile, and find the right place to insert as for the normal
- * case.
+ * Here we have to do something Lehman and Yao don't talk about: deal with
+ * a root split and construction of a new root. If our stack is empty
+ * then we have just split a node on what had been the root level when we
+ * descended the tree. If it was still the root then we perform a
+ * new-root construction. If it *wasn't* the root anymore, search to find
+ * the next higher level that someone constructed meanwhile, and find the
+ * right place to insert as for the normal case.
*
- * If we have to search for the parent level, we do so by re-descending
- * from the root. This is not super-efficient, but it's rare enough
- * not to matter. (This path is also taken when called from WAL
- * recovery --- we have no stack in that case.)
+ * If we have to search for the parent level, we do so by re-descending from
+ * the root. This is not super-efficient, but it's rare enough not to
+ * matter. (This path is also taken when called from WAL recovery --- we
+ * have no stack in that case.)
*/
if (is_root)
{
@@ -1219,9 +1215,9 @@ _bt_insert_parent(Relation rel,
/*
* Find the parent buffer and get the parent page.
*
- * Oops - if we were moved right then we need to change stack item!
- * We want to find parent pointing to where we are, right ? -
- * vadim 05/27/97
+ * Oops - if we were moved right then we need to change stack item! We
+ * want to find parent pointing to where we are, right ? - vadim
+ * 05/27/97
*/
ItemPointerSet(&(stack->bts_btitem.bti_itup.t_tid),
bknum, P_HIKEY);
@@ -1291,9 +1287,9 @@ _bt_getstackbuf(Relation rel, BTStack stack, int access)
maxoff = PageGetMaxOffsetNumber(page);
/*
- * start = InvalidOffsetNumber means "search the whole page".
- * We need this test anyway due to possibility that page has a
- * high key now when it didn't before.
+ * start = InvalidOffsetNumber means "search the whole page". We
+ * need this test anyway due to possibility that page has a high
+ * key now when it didn't before.
*/
if (start < minoff)
start = minoff;
@@ -1307,8 +1303,8 @@ _bt_getstackbuf(Relation rel, BTStack stack, int access)
/*
* These loops will check every item on the page --- but in an
- * order that's attuned to the probability of where it
- * actually is. Scan to the right first, then to the left.
+ * order that's attuned to the probability of where it actually
+ * is. Scan to the right first, then to the left.
*/
for (offnum = start;
offnum <= maxoff;
@@ -1424,9 +1420,9 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
metad->btm_fastlevel = rootopaque->btpo.level;
/*
- * Create downlink item for left page (old root). Since this will be
- * the first item in a non-leaf page, it implicitly has minus-infinity
- * key value, so we need not store any actual key in it.
+ * Create downlink item for left page (old root). Since this will be the
+ * first item in a non-leaf page, it implicitly has minus-infinity key
+ * value, so we need not store any actual key in it.
*/
itemsz = sizeof(BTItemData);
new_item = (BTItem) palloc(itemsz);
@@ -1434,17 +1430,17 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
ItemPointerSet(&(new_item->bti_itup.t_tid), lbkno, P_HIKEY);
/*
- * Insert the left page pointer into the new root page. The root page
- * is the rightmost page on its level so there is no "high key" in it;
- * the two items will go into positions P_HIKEY and P_FIRSTKEY.
+ * Insert the left page pointer into the new root page. The root page is
+ * the rightmost page on its level so there is no "high key" in it; the
+ * two items will go into positions P_HIKEY and P_FIRSTKEY.
*/
if (PageAddItem(rootpage, (Item) new_item, itemsz, P_HIKEY, LP_USED) == InvalidOffsetNumber)
elog(PANIC, "failed to add leftkey to new root page");
pfree(new_item);
/*
- * Create downlink item for right page. The key for it is obtained
- * from the "high key" position in the left page.
+ * Create downlink item for right page. The key for it is obtained from
+ * the "high key" position in the left page.
*/
itemid = PageGetItemId(lpage, P_HIKEY);
itemsz = ItemIdGetLength(itemid);
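
To summarize the two downlinks a new root ends up holding, a toy model (integer keys, not real index tuples): the left downlink is the first item of a non-leaf page and therefore needs no key at all, while the right downlink borrows the left page's high key as the separator between the two halves.

#include <stdbool.h>

typedef struct ModelDownlink
{
    bool    minus_infinity;     /* true => no key stored */
    int     key;                /* valid only when !minus_infinity */
    int     child_blkno;
} ModelDownlink;

static void
model_new_root(int left_blkno, int right_blkno, int left_highkey,
               ModelDownlink root_items[2])
{
    /* first item on a non-leaf page: implicitly minus infinity, no key */
    root_items[0].minus_infinity = true;
    root_items[0].key = 0;
    root_items[0].child_blkno = left_blkno;

    /* second item: separator taken from the left page's high key */
    root_items[1].minus_infinity = false;
    root_items[1].key = left_highkey;
    root_items[1].child_blkno = right_blkno;
}
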
@@ -1476,8 +1472,8 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
rdata[0].next = &(rdata[1]);
/*
- * Direct access to page is not good but faster - we should
- * implement some new func in page API.
+ * Direct access to page is not good but faster - we should implement
+ * some new func in page API.
*/
rdata[1].data = (char *) rootpage + ((PageHeader) rootpage)->pd_upper;
rdata[1].len = ((PageHeader) rootpage)->pd_special -