 contrib/pageinspect/expected/hash.out |   8
 doc/src/sgml/pageinspect.sgml         |   2
 src/backend/access/hash/README        |  68
 src/backend/access/hash/hash.c        |  59
 src/backend/access/hash/hashinsert.c  |  83
 src/backend/access/hash/hashpage.c    | 151
 src/backend/access/hash/hashsearch.c  |  69
 src/include/access/hash.h             |  20
 8 files changed, 279 insertions(+), 181 deletions(-)
diff --git a/contrib/pageinspect/expected/hash.out b/contrib/pageinspect/expected/hash.out
index 31b67999fc8..7eb1537b29e 100644
--- a/contrib/pageinspect/expected/hash.out
+++ b/contrib/pageinspect/expected/hash.out
@@ -98,7 +98,7 @@ hash_page_stats(get_raw_page('test_hash_a_idx', 1));
 live_items       | 0
 dead_items       | 0
 page_size        | 8192
-hasho_prevblkno  | 4294967295
+hasho_prevblkno  | 3
 hasho_nextblkno  | 4294967295
 hasho_bucket     | 0
 hasho_flag       | 2
@@ -111,7 +111,7 @@ hash_page_stats(get_raw_page('test_hash_a_idx', 2));
 live_items       | 0
 dead_items       | 0
 page_size        | 8192
-hasho_prevblkno  | 4294967295
+hasho_prevblkno  | 3
 hasho_nextblkno  | 4294967295
 hasho_bucket     | 1
 hasho_flag       | 2
@@ -124,7 +124,7 @@ hash_page_stats(get_raw_page('test_hash_a_idx', 3));
 live_items       | 1
 dead_items       | 0
 page_size        | 8192
-hasho_prevblkno  | 4294967295
+hasho_prevblkno  | 3
 hasho_nextblkno  | 4294967295
 hasho_bucket     | 2
 hasho_flag       | 2
@@ -137,7 +137,7 @@ hash_page_stats(get_raw_page('test_hash_a_idx', 4));
 live_items       | 0
 dead_items       | 0
 page_size        | 8192
-hasho_prevblkno  | 4294967295
+hasho_prevblkno  | 3
 hasho_nextblkno  | 4294967295
 hasho_bucket     | 3
 hasho_flag       | 2
diff --git a/doc/src/sgml/pageinspect.sgml b/doc/src/sgml/pageinspect.sgml
index 4c201e75b0d..5e6712f9cde 100644
--- a/doc/src/sgml/pageinspect.sgml
+++ b/doc/src/sgml/pageinspect.sgml
@@ -539,7 +539,7 @@ live_items       | 407
 dead_items       | 0
 page_size        | 8192
 free_size        | 8
-hasho_prevblkno  | 4294967295
+hasho_prevblkno  | 4096
 hasho_nextblkno  | 8474
 hasho_bucket     | 0
 hasho_flag       | 66
diff --git a/src/backend/access/hash/README b/src/backend/access/hash/README
index 01ea115f4d4..703ae982071 100644
--- a/src/backend/access/hash/README
+++ b/src/backend/access/hash/README
@@ -149,6 +149,50 @@ We choose to always lock the lower-numbered bucket first.  The metapage is
 only ever locked after all bucket locks have been taken.
 
+
+Metapage Caching
+----------------
+
+Both scanning the index and inserting tuples require locating the bucket
+where a given tuple ought to be located.  To do this, we need the bucket
+count, highmask, and lowmask from the metapage; however, it's undesirable
+for performance reasons to have to lock and pin the metapage for every such
+operation.  Instead, we retain a cached copy of the metapage in each
+backend's relcache entry.  This will produce the correct bucket mapping as
+long as the target bucket hasn't been split since the last cache refresh.
+
+To guard against the possibility that such a split has occurred, the
+primary page of each bucket chain stores the number of buckets that
+existed as of the time the bucket was last split, or, if never split, as
+of the time it was created, in the space normally used for the previous
+block number (that is, hasho_prevblkno).  This doesn't cost anything
+because the primary bucket page is always the first page in the chain,
+and the previous block number is therefore always, in reality,
+InvalidBlockNumber.
+
+After computing the ostensibly-correct bucket number based on our cached
+copy of the metapage, we lock the corresponding primary bucket page and
+check whether the bucket count stored in hasho_prevblkno is greater than
+the number of buckets stored in our cached copy of the metapage.  If so,
+the bucket has certainly been split, because the count must originally
+have been less than the number of buckets that existed at that time and
+can't have increased except due to a split.
+If not, the bucket can't have been split, because a split would have
+created a new bucket with a higher bucket number than any we'd seen
+previously.  In the latter case, we've locked the correct bucket and can
+proceed; in the former case, we must release the lock on this bucket, lock
+the metapage, update our cache, unlock the metapage, and retry.
+
+Needing to retry occasionally might seem expensive, but the number of times
+any given bucket can be split is limited to a few dozen no matter how
+many times the hash index is accessed, because the total number of
+buckets is limited to less than 2^32.  On the other hand, the number of
+times we access a bucket is unbounded and will be several orders of
+magnitude larger even in unsympathetic cases.
+
+(The metapage cache is new in v10.  Older hash indexes had the primary
+bucket page's hasho_prevblkno initialized to InvalidBlockNumber.)
+
 
 Pseudocode Algorithms
 ---------------------
 
@@ -188,17 +232,7 @@ track of available overflow pages.
 
 The reader algorithm is:
 
-    pin meta page and take buffer content lock in shared mode
-    loop:
-        compute bucket number for target hash key
-        release meta page buffer content lock
-        if (correct bucket page is already locked)
-            break
-        release any existing bucket page buffer content lock (if a concurrent
-        split happened)
-        take the buffer content lock on bucket page in shared mode
-        retake meta page buffer content lock in shared mode
-    release pin on metapage
+    lock the primary bucket page of the target bucket
     if the target bucket is still being populated by a split:
         release the buffer content lock on current bucket page
         pin and acquire the buffer content lock on old bucket in shared mode
@@ -238,17 +272,7 @@ which this bucket is formed by split.
 
 The insertion algorithm is rather similar:
 
-    pin meta page and take buffer content lock in shared mode
-    loop:
-        compute bucket number for target hash key
-        release meta page buffer content lock
-        if (correct bucket page is already locked)
-            break
-        release any existing bucket page buffer content lock (if a concurrent
-        split happened)
-        take the buffer content lock on bucket page in exclusive mode
-        retake meta page buffer content lock in shared mode
-    release pin on metapage
+    lock the primary bucket page of the target bucket
 -- (so far same as reader, except for acquisition of buffer content lock in
 exclusive mode on primary bucket page)
     if the bucket-being-split flag is set for a bucket and pin count on it is
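[Editor's illustration, not part of the patch: the protocol described in the new README section condenses to roughly the C below, using only functions and macros this patch introduces or already relies on.  The wrapper name lock_target_bucket_sketch and the standalone framing are invented for illustration; the authoritative implementation is _hash_getbucketbuf_from_hashkey() in hashpage.c later in this patch.]

/* Editorial sketch -- not part of the patch. */
#include "postgres.h"

#include "access/hash.h"

static Buffer
lock_target_bucket_sketch(Relation rel, uint32 hashkey, int access)
{
    Buffer      metabuf = InvalidBuffer;
    HashMetaPage metap;

    /* Start from the per-backend cached copy of the metapage. */
    metap = _hash_getcachedmetap(rel, &metabuf, false);

    for (;;)
    {
        Bucket      bucket;
        BlockNumber blkno;
        Buffer      buf;
        HashPageOpaque opaque;

        /* Compute the apparent target bucket from the cached metapage. */
        bucket = _hash_hashkey2bucket(hashkey,
                                      metap->hashm_maxbucket,
                                      metap->hashm_highmask,
                                      metap->hashm_lowmask);
        blkno = BUCKET_TO_BLKNO(metap, bucket);

        /* Lock that bucket's primary page. */
        buf = _hash_getbuf(rel, blkno, access, LH_BUCKET_PAGE);
        opaque = (HashPageOpaque) PageGetSpecialPointer(BufferGetPage(buf));

        /*
         * If the bucket count stamped on the primary page is not newer than
         * the cached bucket count, the cache was fresh enough; we're done.
         * The InvalidBlockNumber case covers pre-v10 on-disk pages.
         */
        if (opaque->hasho_prevblkno == InvalidBlockNumber ||
            opaque->hasho_prevblkno <= metap->hashm_maxbucket)
        {
            if (BufferIsValid(metabuf))
                _hash_dropbuf(rel, metabuf);
            return buf;
        }

        /* Otherwise the cache is stale: release, refresh, and retry. */
        _hash_relbuf(rel, buf);
        metap = _hash_getcachedmetap(rel, &metabuf, true);
    }
}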
diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c
index ec8ed33c708..97ad22aa6f3 100644
--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@@ -507,28 +507,24 @@ hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
     Bucket      orig_maxbucket;
     Bucket      cur_maxbucket;
     Bucket      cur_bucket;
-    Buffer      metabuf;
+    Buffer      metabuf = InvalidBuffer;
     HashMetaPage metap;
-    HashMetaPageData local_metapage;
+    HashMetaPage cachedmetap;
 
     tuples_removed = 0;
     num_index_tuples = 0;
 
     /*
-     * Read the metapage to fetch original bucket and tuple counts.  Also, we
-     * keep a copy of the last-seen metapage so that we can use its
-     * hashm_spares[] values to compute bucket page addresses.  This is a bit
-     * hokey but perfectly safe, since the interesting entries in the spares
-     * array cannot change under us; and it beats rereading the metapage for
-     * each bucket.
+     * We need a copy of the metapage so that we can use its hashm_spares[]
+     * values to compute bucket page addresses, but a cached copy should be
+     * good enough.  (If not, we'll detect that further down and refresh the
+     * cache as necessary.)
      */
-    metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
-    metap = HashPageGetMeta(BufferGetPage(metabuf));
-    orig_maxbucket = metap->hashm_maxbucket;
-    orig_ntuples = metap->hashm_ntuples;
-    memcpy(&local_metapage, metap, sizeof(local_metapage));
-    /* release the lock, but keep pin */
-    LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
+    cachedmetap = _hash_getcachedmetap(rel, &metabuf, false);
+    Assert(cachedmetap != NULL);
+
+    orig_maxbucket = cachedmetap->hashm_maxbucket;
+    orig_ntuples = cachedmetap->hashm_ntuples;
 
     /* Scan the buckets that we know exist */
     cur_bucket = 0;
@@ -546,7 +542,7 @@ loop_top:
         bool        split_cleanup = false;
 
         /* Get address of bucket's start page */
-        bucket_blkno = BUCKET_TO_BLKNO(&local_metapage, cur_bucket);
+        bucket_blkno = BUCKET_TO_BLKNO(cachedmetap, cur_bucket);
 
         blkno = bucket_blkno;
 
@@ -577,20 +573,27 @@ loop_top:
              * hashm_lowmask might be old enough to cause us to fail to remove
              * tuples left behind by the most recent split.  To prevent that,
              * now that the primary page of the target bucket has been locked
-             * (and thus can't be further split), update our cached metapage
-             * data.
+             * (and thus can't be further split), check whether we need to
+             * update our cached metapage data.
+             *
+             * NB: The check for InvalidBlockNumber is only needed for
+             * on-disk compatibility with indexes created before we started
+             * storing hashm_maxbucket in the primary page's hasho_prevblkno.
              */
-            LockBuffer(metabuf, BUFFER_LOCK_SHARE);
-            memcpy(&local_metapage, metap, sizeof(local_metapage));
-            LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
+            if (bucket_opaque->hasho_prevblkno != InvalidBlockNumber &&
+                bucket_opaque->hasho_prevblkno > cachedmetap->hashm_maxbucket)
+            {
+                cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
+                Assert(cachedmetap != NULL);
+            }
         }
 
         bucket_buf = buf;
 
         hashbucketcleanup(rel, cur_bucket, bucket_buf, blkno, info->strategy,
-                          local_metapage.hashm_maxbucket,
-                          local_metapage.hashm_highmask,
-                          local_metapage.hashm_lowmask, &tuples_removed,
+                          cachedmetap->hashm_maxbucket,
+                          cachedmetap->hashm_highmask,
+                          cachedmetap->hashm_lowmask, &tuples_removed,
                           &num_index_tuples, split_cleanup,
                           callback, callback_state);
 
@@ -600,6 +603,9 @@ loop_top:
         cur_bucket++;
     }
 
+    if (BufferIsInvalid(metabuf))
+        metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE);
+
     /* Write-lock metapage and check for split since we started */
     LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
     metap = HashPageGetMeta(BufferGetPage(metabuf));
@@ -607,9 +613,10 @@ loop_top:
     if (cur_maxbucket != metap->hashm_maxbucket)
     {
         /* There's been a split, so process the additional bucket(s) */
-        cur_maxbucket = metap->hashm_maxbucket;
-        memcpy(&local_metapage, metap, sizeof(local_metapage));
         LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
+        cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
+        Assert(cachedmetap != NULL);
+        cur_maxbucket = cachedmetap->hashm_maxbucket;
         goto loop_top;
     }
 
diff --git a/src/backend/access/hash/hashinsert.c b/src/backend/access/hash/hashinsert.c
index 39c70d3a80f..dc63063ac1f 100644
--- a/src/backend/access/hash/hashinsert.c
+++ b/src/backend/access/hash/hashinsert.c
@@ -32,9 +32,7 @@ _hash_doinsert(Relation rel, IndexTuple itup)
     Buffer      bucket_buf;
     Buffer      metabuf;
     HashMetaPage metap;
-    BlockNumber blkno;
-    BlockNumber oldblkno;
-    bool        retry;
+    HashMetaPage usedmetap = NULL;
     Page        metapage;
     Page        page;
     HashPageOpaque pageopaque;
@@ -42,9 +40,6 @@ _hash_doinsert(Relation rel, IndexTuple itup)
     bool        do_expand;
     uint32      hashkey;
     Bucket      bucket;
-    uint32      maxbucket;
-    uint32      highmask;
-    uint32      lowmask;
 
     /*
      * Get the hash key for the item (it's stored in the index tuple itself).
@@ -57,10 +52,14 @@
      * need to be consistent
      */
 restart_insert:
-    /* Read the metapage */
-    metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
+
+    /*
+     * Read the metapage.  We don't lock it yet; HashMaxItemSize() will
+     * examine pd_pagesize_version, but that can't change so we can examine
+     * it without a lock.
+     */
+    metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE);
     metapage = BufferGetPage(metabuf);
-    metap = HashPageGetMeta(metapage);
 
     /*
      * Check whether the item can fit on a hash page at all. (Eventually, we
@@ -76,66 +75,17 @@ restart_insert:
                    itemsz, HashMaxItemSize(metapage)),
             errhint("Values larger than a buffer page cannot be indexed.")));
 
-    oldblkno = InvalidBlockNumber;
-    retry = false;
-
-    /*
-     * Loop until we get a lock on the correct target bucket.
-     */
-    for (;;)
-    {
-        /*
-         * Compute the target bucket number, and convert to block number.
-         */
-        bucket = _hash_hashkey2bucket(hashkey,
-                                      metap->hashm_maxbucket,
-                                      metap->hashm_highmask,
-                                      metap->hashm_lowmask);
-
-        blkno = BUCKET_TO_BLKNO(metap, bucket);
-
-        /*
-         * Copy bucket mapping info now; refer the comment in
-         * _hash_expandtable where we copy this information before calling
-         * _hash_splitbucket to see why this is okay.
-         */
-        maxbucket = metap->hashm_maxbucket;
-        highmask = metap->hashm_highmask;
-        lowmask = metap->hashm_lowmask;
-
-        /* Release metapage lock, but keep pin. */
-        LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
-
-        /*
-         * If the previous iteration of this loop locked the primary page of
-         * what is still the correct target bucket, we are done.  Otherwise,
-         * drop any old lock before acquiring the new one.
-         */
-        if (retry)
-        {
-            if (oldblkno == blkno)
-                break;
-            _hash_relbuf(rel, buf);
-        }
-
-        /* Fetch and lock the primary bucket page for the target bucket */
-        buf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BUCKET_PAGE);
-
-        /*
-         * Reacquire metapage lock and check that no bucket split has taken
-         * place while we were awaiting the bucket lock.
-         */
-        LockBuffer(metabuf, BUFFER_LOCK_SHARE);
-        oldblkno = blkno;
-        retry = true;
-    }
+    /* Lock the primary bucket page for the target bucket. */
+    buf = _hash_getbucketbuf_from_hashkey(rel, hashkey, HASH_WRITE,
+                                          &usedmetap);
+    Assert(usedmetap != NULL);
 
     /* remember the primary bucket buffer to release the pin on it at end. */
     bucket_buf = buf;
 
     page = BufferGetPage(buf);
     pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
-    Assert(pageopaque->hasho_bucket == bucket);
+    bucket = pageopaque->hasho_bucket;
 
     /*
      * If this bucket is in the process of being split, try to finish the
@@ -151,8 +101,10 @@ restart_insert:
         /* release the lock on bucket buffer, before completing the split. */
         LockBuffer(buf, BUFFER_LOCK_UNLOCK);
 
-        _hash_finish_split(rel, metabuf, buf, pageopaque->hasho_bucket,
-                           maxbucket, highmask, lowmask);
+        _hash_finish_split(rel, metabuf, buf, bucket,
+                           usedmetap->hashm_maxbucket,
+                           usedmetap->hashm_highmask,
+                           usedmetap->hashm_lowmask);
 
         /* release the pin on old and meta buffer.  retry for insert. */
         _hash_dropbuf(rel, buf);
@@ -225,6 +177,7 @@ restart_insert:
      */
     LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
 
+    metap = HashPageGetMeta(metapage);
     metap->hashm_ntuples += 1;
 
     /* Make sure this stays in sync with _hash_expandtable() */
diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c
index 69676eba953..d52f149389b 100644
--- a/src/backend/access/hash/hashpage.c
+++ b/src/backend/access/hash/hashpage.c
@@ -434,7 +434,13 @@ _hash_metapinit(Relation rel, double num_tuples, ForkNumber forkNum)
         buf = _hash_getnewbuf(rel, BUCKET_TO_BLKNO(metap, i), forkNum);
         pg = BufferGetPage(buf);
         pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
-        pageopaque->hasho_prevblkno = InvalidBlockNumber;
+
+        /*
+         * Set hasho_prevblkno to the current hashm_maxbucket.  This value
+         * will be used to validate cached HashMetaPageData.  See
+         * _hash_getbucketbuf_from_hashkey().
+         */
+        pageopaque->hasho_prevblkno = metap->hashm_maxbucket;
         pageopaque->hasho_nextblkno = InvalidBlockNumber;
         pageopaque->hasho_bucket = i;
         pageopaque->hasho_flag = LH_BUCKET_PAGE;
@@ -840,10 +846,14 @@ _hash_splitbucket(Relation rel,
     oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
 
     /*
-     * Mark the old bucket to indicate that split is in progress.  At
-     * operation end, we clear split-in-progress flag.
+     * Mark the old bucket to indicate that split is in progress.  (At
+     * operation end, we will clear the split-in-progress flag.)  Also,
+     * for a primary bucket page, hasho_prevblkno stores the number of
+     * buckets that existed as of the last split, so we must update that
+     * value here.
      */
     oopaque->hasho_flag |= LH_BUCKET_BEING_SPLIT;
+    oopaque->hasho_prevblkno = maxbucket;
 
     npage = BufferGetPage(nbuf);
 
@@ -852,7 +862,7 @@ _hash_splitbucket(Relation rel,
      * split is in progress.
      */
     nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
-    nopaque->hasho_prevblkno = InvalidBlockNumber;
+    nopaque->hasho_prevblkno = maxbucket;
     nopaque->hasho_nextblkno = InvalidBlockNumber;
     nopaque->hasho_bucket = nbucket;
     nopaque->hasho_flag = LH_BUCKET_PAGE | LH_BUCKET_BEING_POPULATED;
@@ -1191,3 +1201,136 @@ _hash_finish_split(Relation rel, Buffer metabuf, Buffer obuf, Bucket obucket,
     LockBuffer(obuf, BUFFER_LOCK_UNLOCK);
     hash_destroy(tidhtab);
 }
+
+/*
+ * _hash_getcachedmetap() -- Returns cached metapage data.
+ *
+ * If metabuf is not InvalidBuffer, caller must hold a pin, but no lock, on
+ * the metapage.  If it is InvalidBuffer, we'll set it before returning if
+ * we have to refresh the cache, and return with a pin but no lock on it;
+ * the caller is responsible for releasing the pin.
+ *
+ * We refresh the cache if it's not initialized yet or force_refresh is true.
+ */
+HashMetaPage
+_hash_getcachedmetap(Relation rel, Buffer *metabuf, bool force_refresh)
+{
+    Page        page;
+
+    Assert(metabuf);
+    if (force_refresh || rel->rd_amcache == NULL)
+    {
+        char       *cache;
+
+        /*
+         * It's important that we don't set rd_amcache to an invalid
+         * value.  Either MemoryContextAlloc or _hash_getbuf could fail,
+         * so don't install a pointer to the newly-allocated storage in the
+         * actual relcache entry until both have succeeded.
+         */
+        if (rel->rd_amcache == NULL)
+            cache = MemoryContextAlloc(rel->rd_indexcxt,
+                                       sizeof(HashMetaPageData));
+
+        /* Read the metapage. */
+        if (BufferIsValid(*metabuf))
+            LockBuffer(*metabuf, BUFFER_LOCK_SHARE);
+        else
+            *metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ,
+                                    LH_META_PAGE);
+        page = BufferGetPage(*metabuf);
+
+        /* Populate the cache. */
+        if (rel->rd_amcache == NULL)
+            rel->rd_amcache = cache;
+        memcpy(rel->rd_amcache, HashPageGetMeta(page),
+               sizeof(HashMetaPageData));
+
+        /* Release metapage lock, but keep the pin. */
+        LockBuffer(*metabuf, BUFFER_LOCK_UNLOCK);
+    }
+
+    return (HashMetaPage) rel->rd_amcache;
+}
+
+/*
+ * _hash_getbucketbuf_from_hashkey() -- Get the bucket's buffer for the
+ *                                      given hashkey.
+ *
+ * Bucket pages do not move or get removed once they are allocated.  This
+ * gives us an opportunity to use the previously saved metapage contents to
+ * reach the target bucket buffer, instead of reading from the metapage
+ * every time.  This saves one buffer access every time we want to reach
+ * the target bucket buffer, which is a very helpful savings in bufmgr
+ * traffic and contention.
+ *
+ * The access type parameter (HASH_READ or HASH_WRITE) indicates whether the
+ * bucket buffer has to be locked for reading or writing.
+ *
+ * The out parameter cachedmetap is set with metapage contents used for
+ * hashkey to bucket buffer mapping.  Some callers need this info to reach
+ * the old bucket in case of bucket split, see _hash_doinsert().
+ */
+Buffer
+_hash_getbucketbuf_from_hashkey(Relation rel, uint32 hashkey, int access,
+                                HashMetaPage *cachedmetap)
+{
+    HashMetaPage metap;
+    Buffer      buf;
+    Buffer      metabuf = InvalidBuffer;
+    Page        page;
+    Bucket      bucket;
+    BlockNumber blkno;
+    HashPageOpaque opaque;
+
+    /* We read from the target bucket buffer, so locking it is a must. */
+    Assert(access == HASH_READ || access == HASH_WRITE);
+
+    metap = _hash_getcachedmetap(rel, &metabuf, false);
+    Assert(metap != NULL);
+
+    /*
+     * Loop until we get a lock on the correct target bucket.
+     */
+    for (;;)
+    {
+        /*
+         * Compute the target bucket number, and convert to block number.
+         */
+        bucket = _hash_hashkey2bucket(hashkey,
+                                      metap->hashm_maxbucket,
+                                      metap->hashm_highmask,
+                                      metap->hashm_lowmask);
+
+        blkno = BUCKET_TO_BLKNO(metap, bucket);
+
+        /* Fetch the primary bucket page for the bucket */
+        buf = _hash_getbuf(rel, blkno, access, LH_BUCKET_PAGE);
+        page = BufferGetPage(buf);
+        opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+        Assert(opaque->hasho_bucket == bucket);
+
+        /*
+         * If this bucket hasn't been split, we're done.
+         *
+         * NB: The check for InvalidBlockNumber is only needed for on-disk
+         * compatibility with indexes created before we started storing
+         * hashm_maxbucket in the primary page's hasho_prevblkno.
+         */
+        if (opaque->hasho_prevblkno == InvalidBlockNumber ||
+            opaque->hasho_prevblkno <= metap->hashm_maxbucket)
+            break;
+
+        /* Drop lock on this buffer, update cached metapage, and retry. */
+        _hash_relbuf(rel, buf);
+        metap = _hash_getcachedmetap(rel, &metabuf, true);
+        Assert(metap != NULL);
+    }
+
+    if (BufferIsValid(metabuf))
+        _hash_dropbuf(rel, metabuf);
+
+    if (cachedmetap)
+        *cachedmetap = metap;
+
+    return buf;
+}
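[Editor's usage note, not part of the patch: the caller-side contract of _hash_getcachedmetap() mirrors what hashbulkdelete() does above -- pass InvalidBuffer the first time, keep whatever pin the function takes, force a refresh only when a primary bucket page reports a newer bucket count, and drop the pin at the end.  The function name below is invented for illustration.]

/* Editorial sketch -- not part of the patch. */
#include "postgres.h"

#include "access/hash.h"

static void
cached_metap_usage_sketch(Relation rel)
{
    Buffer      metabuf = InvalidBuffer;
    HashMetaPage cachedmetap;

    /*
     * First call: populates rel->rd_amcache if needed.  metabuf comes back
     * pinned (but unlocked) only if the metapage actually had to be read.
     */
    cachedmetap = _hash_getcachedmetap(rel, &metabuf, false);
    Assert(cachedmetap != NULL);

    /*
     * ... use cachedmetap->hashm_maxbucket, hashm_spares[], etc. to compute
     * bucket addresses without touching the shared metapage ...
     */

    /* Later, if a primary bucket page shows a newer bucket count, refresh. */
    cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
    Assert(cachedmetap != NULL);

    /* The caller owns whatever pin was taken on the metapage. */
    if (BufferIsValid(metabuf))
        _hash_dropbuf(rel, metabuf);
}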
diff --git a/src/backend/access/hash/hashsearch.c b/src/backend/access/hash/hashsearch.c
index a59ad6ff707..9e5d7e4babe 100644
--- a/src/backend/access/hash/hashsearch.c
+++ b/src/backend/access/hash/hashsearch.c
@@ -139,6 +139,7 @@ _hash_readprev(IndexScanDesc scan,
     BlockNumber blkno;
     Relation    rel = scan->indexRelation;
     HashScanOpaque so = (HashScanOpaque) scan->opaque;
+    bool        haveprevblk;
 
     blkno = (*opaquep)->hasho_prevblkno;
 
@@ -147,15 +148,23 @@ _hash_readprev(IndexScanDesc scan,
      * comments in _hash_first to know the reason of retaining pin.
      */
     if (*bufp == so->hashso_bucket_buf || *bufp == so->hashso_split_bucket_buf)
+    {
         LockBuffer(*bufp, BUFFER_LOCK_UNLOCK);
+        haveprevblk = false;
+    }
     else
+    {
         _hash_relbuf(rel, *bufp);
+        haveprevblk = true;
+    }
 
     *bufp = InvalidBuffer;
     /* check for interrupts while we're not holding any buffer lock */
     CHECK_FOR_INTERRUPTS();
-    if (BlockNumberIsValid(blkno))
+
+    if (haveprevblk)
     {
+        Assert(BlockNumberIsValid(blkno));
         *bufp = _hash_getbuf(rel, blkno, HASH_READ,
                              LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
         *pagep = BufferGetPage(*bufp);
@@ -215,14 +224,9 @@ _hash_first(IndexScanDesc scan, ScanDirection dir)
     ScanKey     cur;
     uint32      hashkey;
     Bucket      bucket;
-    BlockNumber blkno;
-    BlockNumber oldblkno = InvalidBuffer;
-    bool        retry = false;
     Buffer      buf;
-    Buffer      metabuf;
     Page        page;
     HashPageOpaque opaque;
-    HashMetaPage metap;
     IndexTuple  itup;
     ItemPointer current;
     OffsetNumber offnum;
@@ -277,59 +281,10 @@ _hash_first(IndexScanDesc scan, ScanDirection dir)
 
     so->hashso_sk_hash = hashkey;
 
-    /* Read the metapage */
-    metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
-    page = BufferGetPage(metabuf);
-    metap = HashPageGetMeta(page);
-
-    /*
-     * Loop until we get a lock on the correct target bucket.
-     */
-    for (;;)
-    {
-        /*
-         * Compute the target bucket number, and convert to block number.
-         */
-        bucket = _hash_hashkey2bucket(hashkey,
-                                      metap->hashm_maxbucket,
-                                      metap->hashm_highmask,
-                                      metap->hashm_lowmask);
-
-        blkno = BUCKET_TO_BLKNO(metap, bucket);
-
-        /* Release metapage lock, but keep pin. */
-        LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
-
-        /*
-         * If the previous iteration of this loop locked what is still the
-         * correct target bucket, we are done.  Otherwise, drop any old lock
-         * and lock what now appears to be the correct bucket.
-         */
-        if (retry)
-        {
-            if (oldblkno == blkno)
-                break;
-            _hash_relbuf(rel, buf);
-        }
-
-        /* Fetch the primary bucket page for the bucket */
-        buf = _hash_getbuf(rel, blkno, HASH_READ, LH_BUCKET_PAGE);
-
-        /*
-         * Reacquire metapage lock and check that no bucket split has taken
-         * place while we were awaiting the bucket lock.
-         */
-        LockBuffer(metabuf, BUFFER_LOCK_SHARE);
-        oldblkno = blkno;
-        retry = true;
-    }
-
-    /* done with the metapage */
-    _hash_dropbuf(rel, metabuf);
-
+    buf = _hash_getbucketbuf_from_hashkey(rel, hashkey, HASH_READ, NULL);
     page = BufferGetPage(buf);
     opaque = (HashPageOpaque) PageGetSpecialPointer(page);
-    Assert(opaque->hasho_bucket == bucket);
+    bucket = opaque->hasho_bucket;
 
     so->hashso_bucket_buf = buf;
 
diff --git a/src/include/access/hash.h b/src/include/access/hash.h
index 1a9b91f9f53..c0455851f46 100644
--- a/src/include/access/hash.h
+++ b/src/include/access/hash.h
@@ -61,10 +61,21 @@ typedef uint32 Bucket;
 #define LH_PAGE_TYPE \
     (LH_OVERFLOW_PAGE|LH_BUCKET_PAGE|LH_BITMAP_PAGE|LH_META_PAGE)
 
+/*
+ * In an overflow page, hasho_prevblkno stores the block number of the
+ * previous page in the bucket chain; in a bucket page, hasho_prevblkno
+ * stores the hashm_maxbucket value as of the last time the bucket was
+ * split, or else as of the time the bucket was created.  The latter
+ * convention is used to determine whether a cached copy of the metapage
+ * is too stale to be used without needing to lock or pin the metapage.
+ *
+ * hasho_nextblkno is always the block number of the next page in the
+ * bucket chain, or InvalidBlockNumber if there are no more such pages.
+ */
 typedef struct HashPageOpaqueData
 {
-    BlockNumber hasho_prevblkno;    /* previous ovfl (or bucket) blkno */
-    BlockNumber hasho_nextblkno;    /* next ovfl blkno */
+    BlockNumber hasho_prevblkno;    /* see above */
+    BlockNumber hasho_nextblkno;    /* see above */
     Bucket      hasho_bucket;   /* bucket number this pg belongs to */
     uint16      hasho_flag;     /* page type code, see above */
     uint16      hasho_page_id;  /* for identification of hash indexes */
@@ -309,6 +320,11 @@ extern Buffer _hash_getbuf(Relation rel, BlockNumber blkno,
                int access, int flags);
 extern Buffer _hash_getbuf_with_condlock_cleanup(Relation rel,
                                    BlockNumber blkno, int flags);
+extern HashMetaPage _hash_getcachedmetap(Relation rel, Buffer *metabuf,
+                     bool force_refresh);
+extern Buffer _hash_getbucketbuf_from_hashkey(Relation rel, uint32 hashkey,
+                                int access,
+                                HashMetaPage *cachedmetap);
 extern Buffer _hash_getinitbuf(Relation rel, BlockNumber blkno);
 extern Buffer _hash_getnewbuf(Relation rel, BlockNumber blkno,
                ForkNumber forkNum);
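[Editor's illustration of the new hasho_prevblkno convention documented in hash.h, not part of the patch: an overflow page still treats the field as a block number, while a primary bucket page treats it as the bucket count at its last split, which is what makes the cheap staleness test possible.  The helper name below is invented; the flag and field names come from the patch.]

/* Editorial sketch -- not part of the patch. */
#include "postgres.h"

#include "access/hash.h"

/*
 * Is the caller's cached metapage too old to map keys onto this bucket?
 * Only meaningful for a primary bucket page; on an overflow page,
 * hasho_prevblkno is still an ordinary block number.  InvalidBlockNumber
 * means the page predates this change and carries no bucket count at all.
 */
static bool
cached_metap_is_stale_sketch(HashPageOpaque bucket_opaque,
                             HashMetaPage cachedmetap)
{
    Assert(bucket_opaque->hasho_flag & LH_BUCKET_PAGE);

    if (bucket_opaque->hasho_prevblkno == InvalidBlockNumber)
        return false;           /* pre-v10 page: nothing to compare against */

    return bucket_opaque->hasho_prevblkno > cachedmetap->hashm_maxbucket;
}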