author     Andrew Dunstan <andrew@dunslane.net>   2018-03-26 22:39:24 +1030
committer  Andrew Dunstan <andrew@dunslane.net>   2018-03-26 22:39:24 +1030
commit     2b27273435392d1606f0ffc95d73a439a457f08e
tree       16c75ebf862e1a3167e9e4bbf44c72e1e26655bf  /src/backend/access/nbtree/nbtinsert.c
parent     c515ff8d0a979fb553136a71388017c97785acda

Optimize btree insertions for common case of increasing values
Remember the last page of an index insert if it's the rightmost leaf
page. If the next entry belongs on and can fit in the remembered page,
insert the new entry there as long as we can get a lock on the page.
Otherwise, fall back on the more expensive method of searching for
the right place to insert the entry.
This provides a performance improvement for the common case where an
index entry is for a monotonically increasing or nearly monotonically
increasing value, such as an identity field or a current timestamp.
Pavan Deolasee
Reviewed by Claudio Freire, Simon Riggs and Peter Geoghegan
Discussion: https://postgr.es/m/CABOikdM9DrupjyKZZFM5k8-0RCDs1wk6JzEkg7UgSW6QzOwMZw@mail.gmail.com
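
For orientation before the patch itself, the test that decides whether the
fastpath can be used boils down to roughly the following C sketch. This is a
reading aid distilled from the diff below, not code taken from the patch; the
helper name fastpath_usable and its signature are invented here, and the
snippet assumes the usual backend headers and that buf has already been read
and conditionally locked.

    #include "postgres.h"
    #include "access/nbtree.h"
    #include "storage/bufmgr.h"

    /* Sketch: would the cached rightmost leaf page accept this insertion? */
    static bool
    fastpath_usable(Relation rel, Buffer buf, int natts,
                    ScanKey itup_scankey, Size itemsz)
    {
        Page         page = BufferGetPage(buf);
        BTPageOpaque lpageop = (BTPageOpaque) PageGetSpecialPointer(page);

        return P_ISLEAF(lpageop) &&             /* still a leaf page */
            P_RIGHTMOST(lpageop) &&             /* still the rightmost page */
            !P_INCOMPLETE_SPLIT(lpageop) &&     /* no interrupted split */
            !P_IGNORE(lpageop) &&               /* page is not (half-)dead */
            PageGetFreeSpace(page) > itemsz &&  /* the new tuple fits */
            PageGetMaxOffsetNumber(page) >= P_FIRSTDATAKEY(lpageop) &&
            _bt_compare(rel, natts, itup_scankey, page,
                        P_FIRSTDATAKEY(lpageop)) > 0;   /* key sorts after the
                                                         * first key on page */
    }

If any of these checks fail, or the conditional lock on the cached page cannot
be obtained, the patch releases the buffer, clears the cached block number and
falls back to the ordinary _bt_search() descent shown in the diff.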
Diffstat (limited to 'src/backend/access/nbtree/nbtinsert.c')
-rw-r--r--  src/backend/access/nbtree/nbtinsert.c | 138
1 files changed, 119 insertions, 19 deletions
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index 2fe98673531..e85abcfd72d 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -23,6 +23,7 @@
 #include "miscadmin.h"
 #include "storage/lmgr.h"
 #include "storage/predicate.h"
+#include "storage/smgr.h"
 #include "utils/tqual.h"
 
 
@@ -85,7 +86,6 @@ static bool _bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum,
             int keysz, ScanKey scankey);
 static void _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel);
 
-
 /*
  * _bt_doinsert() -- Handle insertion of a single index tuple in the tree.
  *
@@ -111,32 +111,121 @@ _bt_doinsert(Relation rel, IndexTuple itup,
     bool        is_unique = false;
     int         natts = rel->rd_rel->relnatts;
     ScanKey     itup_scankey;
-   BTStack     stack;
+   BTStack     stack = NULL;
     Buffer      buf;
     OffsetNumber offset;
+   bool        fastpath;
 
     /* we need an insertion scan key to do our search, so build one */
     itup_scankey = _bt_mkscankey(rel, itup);
 
+   /*
+    * It's very common to have an index on an auto-incremented or
+    * monotonically increasing value. In such cases, every insertion happens
+    * towards the end of the index. We try to optimise that case by caching
+    * the right-most leaf of the index. If our cached block is still the
+    * rightmost leaf, has enough free space to accommodate a new entry and
+    * the insertion key is strictly greater than the first key in this page,
+    * then we can safely conclude that the new key will be inserted in the
+    * cached block. So we simply search within the cached block and insert the
+    * key at the appropriate location. We call it a fastpath.
+    *
+    * Testing has revealed, though, that the fastpath can result in increased
+    * contention on the exclusive-lock on the rightmost leaf page. So we
+    * conditionally check if the lock is available. If it's not available then
+    * we simply abandon the fastpath and take the regular path. This makes
+    * sense because unavailability of the lock also signals that some other
+    * backend might be concurrently inserting into the page, thus reducing our
+    * chances to finding an insertion place in this page.
+    */
 top:
-   /* find the first page containing this key */
-   stack = _bt_search(rel, natts, itup_scankey, false, &buf, BT_WRITE, NULL);
-
+   fastpath = false;
     offset = InvalidOffsetNumber;
+   if (RelationGetTargetBlock(rel) != InvalidBlockNumber)
+   {
+       Size        itemsz;
+       Page        page;
+       BTPageOpaque lpageop;
 
-   /* trade in our read lock for a write lock */
-   LockBuffer(buf, BUFFER_LOCK_UNLOCK);
-   LockBuffer(buf, BT_WRITE);
+       /*
+        * Conditionally acquire exclusive lock on the buffer before doing any
+        * checks. If we don't get the lock, we simply follow slowpath. If we
+        * do get the lock, this ensures that the index state cannot change, as
+        * far as the rightmost part of the index is concerned.
+        */
+       buf = ReadBuffer(rel, RelationGetTargetBlock(rel));
 
-   /*
-    * If the page was split between the time that we surrendered our read
-    * lock and acquired our write lock, then this page may no longer be the
-    * right place for the key we want to insert.  In this case, we need to
-    * move right in the tree.  See Lehman and Yao for an excruciatingly
-    * precise description.
-    */
-   buf = _bt_moveright(rel, buf, natts, itup_scankey, false,
-                       true, stack, BT_WRITE, NULL);
+       if (ConditionalLockBuffer(buf))
+       {
+           _bt_checkpage(rel, buf);
+
+           page = BufferGetPage(buf);
+
+           lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
+           itemsz = IndexTupleSize(itup);
+           itemsz = MAXALIGN(itemsz);  /* be safe, PageAddItem will do this
+                                        * but we need to be consistent */
+
+           /*
+            * Check if the page is still the rightmost leaf page, has enough
+            * free space to accommodate the new tuple, no split is in progress
+            * and the scankey is greater than or equal to the first key on the
+            * page.
+            */
+           if (P_ISLEAF(lpageop) && P_RIGHTMOST(lpageop) &&
+               !P_INCOMPLETE_SPLIT(lpageop) &&
+               !P_IGNORE(lpageop) &&
+               (PageGetFreeSpace(page) > itemsz) &&
+               PageGetMaxOffsetNumber(page) >= P_FIRSTDATAKEY(lpageop) &&
+               _bt_compare(rel, natts, itup_scankey, page,
+                           P_FIRSTDATAKEY(lpageop)) > 0)
+           {
+               fastpath = true;
+           }
+           else
+           {
+               _bt_relbuf(rel, buf);
+
+               /*
+                * Something did not workout. Just forget about the cached
+                * block and follow the normal path. It might be set again if
+                * the conditions are favourble.
+                */
+               RelationSetTargetBlock(rel, InvalidBlockNumber);
+           }
+       }
+       else
+       {
+           ReleaseBuffer(buf);
+
+           /*
+            * If someone's holding a lock, it's likely to change anyway,
+            * so don't try again until we get an updated rightmost leaf.
+            */
+           RelationSetTargetBlock(rel, InvalidBlockNumber);
+       }
+   }
+
+   if (!fastpath)
+   {
+       /* find the first page containing this key */
+       stack = _bt_search(rel, natts, itup_scankey, false, &buf, BT_WRITE,
+                          NULL);
+
+       /* trade in our read lock for a write lock */
+       LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+       LockBuffer(buf, BT_WRITE);
+
+       /*
+        * If the page was split between the time that we surrendered our read
+        * lock and acquired our write lock, then this page may no longer be
+        * the right place for the key we want to insert. In this case, we
+        * need to move right in the tree. See Lehman and Yao for an
+        * excruciatingly precise description.
+        */
+       buf = _bt_moveright(rel, buf, natts, itup_scankey, false,
+                           true, stack, BT_WRITE, NULL);
+   }
 
     /*
      * If we're not allowing duplicates, make sure the key isn't already in
@@ -184,7 +273,8 @@ top:
                 XactLockTableWait(xwait, rel, &itup->t_tid,
                                   XLTW_InsertIndex);
                 /* start over... */
-               _bt_freestack(stack);
+               if (stack)
+                   _bt_freestack(stack);
                 goto top;
             }
         }
@@ -211,7 +301,8 @@ top:
     }
 
     /* be tidy */
-   _bt_freestack(stack);
+   if (stack)
+       _bt_freestack(stack);
     _bt_freeskey(itup_scankey);
 
     return is_unique;
@@ -879,7 +970,16 @@ _bt_insertonpg(Relation rel,
             XLogRegisterData((char *) &xlrec, SizeOfBtreeInsert);
 
             if (P_ISLEAF(lpageop))
+           {
                 xlinfo = XLOG_BTREE_INSERT_LEAF;
+
+               /*
+                * Cache the block information if we just inserted into the
+                * rightmost leaf page of the index.
+                */
+               if (P_RIGHTMOST(lpageop))
+                   RelationSetTargetBlock(rel, BufferGetBlockNumber(buf));
+           }
             else
             {
                 /*
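
A note on the new #include "storage/smgr.h": the cache used by the fastpath is
the per-relation insertion target block that already existed for heap bulk
inserts. RelationGetTargetBlock() and RelationSetTargetBlock() are macros from
utils/rel.h that read and write the block number stored in the relation's
SMgrRelation, which is why nbtinsert.c now needs smgr.h for that struct's
definition. Their definitions at the time were approximately as follows
(paraphrased from memory as a reference, not quoted from this commit):

    /* Approximate form of the macros in src/include/utils/rel.h */
    #define RelationGetTargetBlock(relation) \
        ((relation)->rd_smgr != NULL ? \
         (relation)->rd_smgr->smgr_targblock : InvalidBlockNumber)

    #define RelationSetTargetBlock(relation, targblock) \
        do { \
            RelationOpenSmgr(relation); \
            (relation)->rd_smgr->smgr_targblock = (targblock); \
        } while (0)

Because the cached block lives in the backend-local smgr entry and is discarded
on any smgr-level invalidation, a stale value is harmless: it only costs the
failed checks above before the code falls back to the normal tree descent.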