-rw-r--r--  contrib/pageinspect/expected/hash.out |   8
-rw-r--r--  doc/src/sgml/pageinspect.sgml         |   2
-rw-r--r--  src/backend/access/hash/README        |  68
-rw-r--r--  src/backend/access/hash/hash.c        |  59
-rw-r--r--  src/backend/access/hash/hashinsert.c  |  83
-rw-r--r--  src/backend/access/hash/hashpage.c    | 151
-rw-r--r--  src/backend/access/hash/hashsearch.c  |  69
-rw-r--r--  src/include/access/hash.h             |  20
8 files changed, 279 insertions(+), 181 deletions(-)
diff --git a/contrib/pageinspect/expected/hash.out b/contrib/pageinspect/expected/hash.out
index 31b67999fc8..7eb1537b29e 100644
--- a/contrib/pageinspect/expected/hash.out
+++ b/contrib/pageinspect/expected/hash.out
@@ -98,7 +98,7 @@ hash_page_stats(get_raw_page('test_hash_a_idx', 1));
live_items | 0
dead_items | 0
page_size | 8192
-hasho_prevblkno | 4294967295
+hasho_prevblkno | 3
hasho_nextblkno | 4294967295
hasho_bucket | 0
hasho_flag | 2
@@ -111,7 +111,7 @@ hash_page_stats(get_raw_page('test_hash_a_idx', 2));
live_items | 0
dead_items | 0
page_size | 8192
-hasho_prevblkno | 4294967295
+hasho_prevblkno | 3
hasho_nextblkno | 4294967295
hasho_bucket | 1
hasho_flag | 2
@@ -124,7 +124,7 @@ hash_page_stats(get_raw_page('test_hash_a_idx', 3));
live_items | 1
dead_items | 0
page_size | 8192
-hasho_prevblkno | 4294967295
+hasho_prevblkno | 3
hasho_nextblkno | 4294967295
hasho_bucket | 2
hasho_flag | 2
@@ -137,7 +137,7 @@ hash_page_stats(get_raw_page('test_hash_a_idx', 4));
live_items | 0
dead_items | 0
page_size | 8192
-hasho_prevblkno | 4294967295
+hasho_prevblkno | 3
hasho_nextblkno | 4294967295
hasho_bucket | 3
hasho_flag | 2
diff --git a/doc/src/sgml/pageinspect.sgml b/doc/src/sgml/pageinspect.sgml
index 4c201e75b0d..5e6712f9cde 100644
--- a/doc/src/sgml/pageinspect.sgml
+++ b/doc/src/sgml/pageinspect.sgml
@@ -539,7 +539,7 @@ live_items | 407
dead_items | 0
page_size | 8192
free_size | 8
-hasho_prevblkno | 4294967295
+hasho_prevblkno | 4096
hasho_nextblkno | 8474
hasho_bucket | 0
hasho_flag | 66
diff --git a/src/backend/access/hash/README b/src/backend/access/hash/README
index 01ea115f4d4..703ae982071 100644
--- a/src/backend/access/hash/README
+++ b/src/backend/access/hash/README
@@ -149,6 +149,50 @@ We choose to always lock the lower-numbered bucket first. The metapage is
only ever locked after all bucket locks have been taken.
+Metapage Caching
+----------------
+
+Both scanning the index and inserting tuples require locating the bucket
+where a given tuple ought to be located. To do this, we need the bucket
+count, highmask, and lowmask from the metapage; however, it's undesirable
+for performance reasons to have to lock and pin the metapage for
+every such operation. Instead, we retain a cached copy of the metapage
+in each backend's relcache entry. This will produce the correct
+bucket mapping as long as the target bucket hasn't been split since the
+last cache refresh.
+
+To guard against the possibility that such a split has occurred, the
+primary page of each bucket chain stores the number of buckets that
+existed as of the time the bucket was last split, or, if it has never been
+split, as of the time it was created, in the space normally used for the
+previous block number (that is, hasho_prevblkno). This doesn't cost
+anything because the primary bucket page is always the first page in
+the chain, and the previous block number is therefore always, in
+reality, InvalidBlockNumber.
+
+After computing the ostensibly-correct bucket number based on our cached
+copy of the metapage, we lock the corresponding primary bucket page and
+check whether the bucket count stored in hasho_prevblkno is greater than
+the number of buckets stored in our cached copy of the metapage. If
+so, the bucket has certainly been split, because the count must originally
+have been less than the number of buckets that existed at that time and
+can't have increased except due to a split. If not, the bucket can't have
+been split, because a split would have created a new bucket with a higher
+bucket number than any we'd seen previously. In the latter case, we've
+locked the correct bucket and can proceed; in the former case, we must
+release the lock on this bucket, lock the metapage, update our cache,
+unlock the metapage, and retry.
+
+Needing to retry occasionally might seem expensive, but the number of times
+any given bucket can be split is limited to a few dozen no matter how
+many times the hash index is accessed, because the total number of
+buckets is limited to less than 2^32. On the other hand, the number of
+times we access a bucket is unbounded and will be several orders of
+magnitude larger even in unsympathetic cases.
+
+(The metapage cache is new in v10. Older hash indexes had the primary
+bucket page's hasho_prevblkno initialized to InvalidBlockNumber.)
+
Pseudocode Algorithms
---------------------
@@ -188,17 +232,7 @@ track of available overflow pages.
The reader algorithm is:
- pin meta page and take buffer content lock in shared mode
- loop:
- compute bucket number for target hash key
- release meta page buffer content lock
- if (correct bucket page is already locked)
- break
- release any existing bucket page buffer content lock (if a concurrent
- split happened)
- take the buffer content lock on bucket page in shared mode
- retake meta page buffer content lock in shared mode
- release pin on metapage
+ lock the primary bucket page of the target bucket
if the target bucket is still being populated by a split:
release the buffer content lock on current bucket page
pin and acquire the buffer content lock on old bucket in shared mode
@@ -238,17 +272,7 @@ which this bucket is formed by split.
The insertion algorithm is rather similar:
- pin meta page and take buffer content lock in shared mode
- loop:
- compute bucket number for target hash key
- release meta page buffer content lock
- if (correct bucket page is already locked)
- break
- release any existing bucket page buffer content lock (if a concurrent
- split happened)
- take the buffer content lock on bucket page in exclusive mode
- retake meta page buffer content lock in shared mode
- release pin on metapage
+ lock the primary bucket page of the target bucket
-- (so far same as reader, except for acquisition of buffer content lock in
exclusive mode on primary bucket page)
if the bucket-being-split flag is set for a bucket and pin count on it is
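
The check described in the new "Metapage Caching" section boils down to a
single comparison plus a retry.  The following standalone C sketch models it
with invented toy types and values (toy_metapage, toy_bucket_page,
toy_refresh_cache); it is illustrative only and not part of the patch:

#include <stdio.h>

/* Stand-ins for the real structures; names and values are invented. */
typedef struct toy_metapage
{
    unsigned    maxbucket;      /* plays the role of hashm_maxbucket */
} toy_metapage;

typedef struct toy_bucket_page
{
    unsigned    prevblkno;      /* plays the role of hasho_prevblkno: the
                                 * bucket count as of this bucket's last
                                 * split (or its creation) */
} toy_bucket_page;

/* Pretend to reread the metapage under lock and update the local cache. */
static void
toy_refresh_cache(toy_metapage *cache, const toy_metapage *shared)
{
    *cache = *shared;
}

int
main(void)
{
    toy_metapage    shared = { 7 };     /* current shared state */
    toy_metapage    cache = { 3 };      /* stale backend-local copy */
    toy_bucket_page bucket = { 6 };     /* bucket split after we cached */

    /*
     * The README's rule: a stored count greater than the cached count means
     * the cache is stale, so refresh it (and, in the real code, recompute
     * the target bucket and relock).
     */
    while (bucket.prevblkno > cache.maxbucket)
    {
        printf("stale cache (stored %u > cached %u); refreshing\n",
               bucket.prevblkno, cache.maxbucket);
        toy_refresh_cache(&cache, &shared);
    }
    printf("cached maxbucket %u is current enough; proceed\n",
           cache.maxbucket);
    return 0;
}
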
diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c
index ec8ed33c708..97ad22aa6f3 100644
--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@@ -507,28 +507,24 @@ hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
Bucket orig_maxbucket;
Bucket cur_maxbucket;
Bucket cur_bucket;
- Buffer metabuf;
+ Buffer metabuf = InvalidBuffer;
HashMetaPage metap;
- HashMetaPageData local_metapage;
+ HashMetaPage cachedmetap;
tuples_removed = 0;
num_index_tuples = 0;
/*
- * Read the metapage to fetch original bucket and tuple counts. Also, we
- * keep a copy of the last-seen metapage so that we can use its
- * hashm_spares[] values to compute bucket page addresses. This is a bit
- * hokey but perfectly safe, since the interesting entries in the spares
- * array cannot change under us; and it beats rereading the metapage for
- * each bucket.
+ * We need a copy of the metapage so that we can use its hashm_spares[]
+ * values to compute bucket page addresses, but a cached copy should be
+ * good enough. (If not, we'll detect that further down and refresh the
+ * cache as necessary.)
*/
- metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
- metap = HashPageGetMeta(BufferGetPage(metabuf));
- orig_maxbucket = metap->hashm_maxbucket;
- orig_ntuples = metap->hashm_ntuples;
- memcpy(&local_metapage, metap, sizeof(local_metapage));
- /* release the lock, but keep pin */
- LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
+ cachedmetap = _hash_getcachedmetap(rel, &metabuf, false);
+ Assert(cachedmetap != NULL);
+
+ orig_maxbucket = cachedmetap->hashm_maxbucket;
+ orig_ntuples = cachedmetap->hashm_ntuples;
/* Scan the buckets that we know exist */
cur_bucket = 0;
@@ -546,7 +542,7 @@ loop_top:
bool split_cleanup = false;
/* Get address of bucket's start page */
- bucket_blkno = BUCKET_TO_BLKNO(&local_metapage, cur_bucket);
+ bucket_blkno = BUCKET_TO_BLKNO(cachedmetap, cur_bucket);
blkno = bucket_blkno;
@@ -577,20 +573,27 @@ loop_top:
* hashm_lowmask might be old enough to cause us to fail to remove
* tuples left behind by the most recent split. To prevent that,
* now that the primary page of the target bucket has been locked
- * (and thus can't be further split), update our cached metapage
- * data.
+ * (and thus can't be further split), check whether we need to
+ * update our cached metapage data.
+ *
+ * NB: The check for InvalidBlockNumber is only needed for
+ * on-disk compatibility with indexes created before we started
+ * storing hashm_maxbucket in the primary page's hasho_prevblkno.
*/
- LockBuffer(metabuf, BUFFER_LOCK_SHARE);
- memcpy(&local_metapage, metap, sizeof(local_metapage));
- LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
+ if (bucket_opaque->hasho_prevblkno != InvalidBlockNumber &&
+ bucket_opaque->hasho_prevblkno > cachedmetap->hashm_maxbucket)
+ {
+ cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
+ Assert(cachedmetap != NULL);
+ }
}
bucket_buf = buf;
hashbucketcleanup(rel, cur_bucket, bucket_buf, blkno, info->strategy,
- local_metapage.hashm_maxbucket,
- local_metapage.hashm_highmask,
- local_metapage.hashm_lowmask, &tuples_removed,
+ cachedmetap->hashm_maxbucket,
+ cachedmetap->hashm_highmask,
+ cachedmetap->hashm_lowmask, &tuples_removed,
&num_index_tuples, split_cleanup,
callback, callback_state);
@@ -600,6 +603,9 @@ loop_top:
cur_bucket++;
}
+ if (BufferIsInvalid(metabuf))
+ metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE);
+
/* Write-lock metapage and check for split since we started */
LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
metap = HashPageGetMeta(BufferGetPage(metabuf));
@@ -607,9 +613,10 @@ loop_top:
if (cur_maxbucket != metap->hashm_maxbucket)
{
/* There's been a split, so process the additional bucket(s) */
- cur_maxbucket = metap->hashm_maxbucket;
- memcpy(&local_metapage, metap, sizeof(local_metapage));
LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
+ cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
+ Assert(cachedmetap != NULL);
+ cur_maxbucket = cachedmetap->hashm_maxbucket;
goto loop_top;
}
diff --git a/src/backend/access/hash/hashinsert.c b/src/backend/access/hash/hashinsert.c
index 39c70d3a80f..dc63063ac1f 100644
--- a/src/backend/access/hash/hashinsert.c
+++ b/src/backend/access/hash/hashinsert.c
@@ -32,9 +32,7 @@ _hash_doinsert(Relation rel, IndexTuple itup)
Buffer bucket_buf;
Buffer metabuf;
HashMetaPage metap;
- BlockNumber blkno;
- BlockNumber oldblkno;
- bool retry;
+ HashMetaPage usedmetap = NULL;
Page metapage;
Page page;
HashPageOpaque pageopaque;
@@ -42,9 +40,6 @@ _hash_doinsert(Relation rel, IndexTuple itup)
bool do_expand;
uint32 hashkey;
Bucket bucket;
- uint32 maxbucket;
- uint32 highmask;
- uint32 lowmask;
/*
* Get the hash key for the item (it's stored in the index tuple itself).
@@ -57,10 +52,14 @@ _hash_doinsert(Relation rel, IndexTuple itup)
* need to be consistent */
restart_insert:
- /* Read the metapage */
- metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
+
+ /*
+ * Read the metapage. We don't lock it yet; HashMaxItemSize() will
+ * examine pd_pagesize_version, but that can't change, so no lock is
+ * needed.
+ */
+ metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE);
metapage = BufferGetPage(metabuf);
- metap = HashPageGetMeta(metapage);
/*
* Check whether the item can fit on a hash page at all. (Eventually, we
@@ -76,66 +75,17 @@ restart_insert:
itemsz, HashMaxItemSize(metapage)),
errhint("Values larger than a buffer page cannot be indexed.")));
- oldblkno = InvalidBlockNumber;
- retry = false;
-
- /*
- * Loop until we get a lock on the correct target bucket.
- */
- for (;;)
- {
- /*
- * Compute the target bucket number, and convert to block number.
- */
- bucket = _hash_hashkey2bucket(hashkey,
- metap->hashm_maxbucket,
- metap->hashm_highmask,
- metap->hashm_lowmask);
-
- blkno = BUCKET_TO_BLKNO(metap, bucket);
-
- /*
- * Copy bucket mapping info now; refer the comment in
- * _hash_expandtable where we copy this information before calling
- * _hash_splitbucket to see why this is okay.
- */
- maxbucket = metap->hashm_maxbucket;
- highmask = metap->hashm_highmask;
- lowmask = metap->hashm_lowmask;
-
- /* Release metapage lock, but keep pin. */
- LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
-
- /*
- * If the previous iteration of this loop locked the primary page of
- * what is still the correct target bucket, we are done. Otherwise,
- * drop any old lock before acquiring the new one.
- */
- if (retry)
- {
- if (oldblkno == blkno)
- break;
- _hash_relbuf(rel, buf);
- }
-
- /* Fetch and lock the primary bucket page for the target bucket */
- buf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BUCKET_PAGE);
-
- /*
- * Reacquire metapage lock and check that no bucket split has taken
- * place while we were awaiting the bucket lock.
- */
- LockBuffer(metabuf, BUFFER_LOCK_SHARE);
- oldblkno = blkno;
- retry = true;
- }
+ /* Lock the primary bucket page for the target bucket. */
+ buf = _hash_getbucketbuf_from_hashkey(rel, hashkey, HASH_WRITE,
+ &usedmetap);
+ Assert(usedmetap != NULL);
/* remember the primary bucket buffer to release the pin on it at end. */
bucket_buf = buf;
page = BufferGetPage(buf);
pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
- Assert(pageopaque->hasho_bucket == bucket);
+ bucket = pageopaque->hasho_bucket;
/*
* If this bucket is in the process of being split, try to finish the
@@ -151,8 +101,10 @@ restart_insert:
/* release the lock on bucket buffer, before completing the split. */
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
- _hash_finish_split(rel, metabuf, buf, pageopaque->hasho_bucket,
- maxbucket, highmask, lowmask);
+ _hash_finish_split(rel, metabuf, buf, bucket,
+ usedmetap->hashm_maxbucket,
+ usedmetap->hashm_highmask,
+ usedmetap->hashm_lowmask);
/* release the pin on old and meta buffer. retry for insert. */
_hash_dropbuf(rel, buf);
@@ -225,6 +177,7 @@ restart_insert:
*/
LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
+ metap = HashPageGetMeta(metapage);
metap->hashm_ntuples += 1;
/* Make sure this stays in sync with _hash_expandtable() */
diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c
index 69676eba953..d52f149389b 100644
--- a/src/backend/access/hash/hashpage.c
+++ b/src/backend/access/hash/hashpage.c
@@ -434,7 +434,13 @@ _hash_metapinit(Relation rel, double num_tuples, ForkNumber forkNum)
buf = _hash_getnewbuf(rel, BUCKET_TO_BLKNO(metap, i), forkNum);
pg = BufferGetPage(buf);
pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
- pageopaque->hasho_prevblkno = InvalidBlockNumber;
+
+ /*
+ * Set hasho_prevblkno with current hashm_maxbucket. This value will
+ * be used to validate cached HashMetaPageData. See
+ * _hash_getbucketbuf_from_hashkey().
+ */
+ pageopaque->hasho_prevblkno = metap->hashm_maxbucket;
pageopaque->hasho_nextblkno = InvalidBlockNumber;
pageopaque->hasho_bucket = i;
pageopaque->hasho_flag = LH_BUCKET_PAGE;
@@ -840,10 +846,14 @@ _hash_splitbucket(Relation rel,
oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
/*
- * Mark the old bucket to indicate that split is in progress. At
- * operation end, we clear split-in-progress flag.
+ * Mark the old bucket to indicate that split is in progress. (At
+ * operation end, we will clear the split-in-progress flag.) Also,
+ * for a primary bucket page, hasho_prevblkno stores the number of
+ * buckets that existed as of the last split, so we must update that
+ * value here.
*/
oopaque->hasho_flag |= LH_BUCKET_BEING_SPLIT;
+ oopaque->hasho_prevblkno = maxbucket;
npage = BufferGetPage(nbuf);
@@ -852,7 +862,7 @@ _hash_splitbucket(Relation rel,
* split is in progress.
*/
nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
- nopaque->hasho_prevblkno = InvalidBlockNumber;
+ nopaque->hasho_prevblkno = maxbucket;
nopaque->hasho_nextblkno = InvalidBlockNumber;
nopaque->hasho_bucket = nbucket;
nopaque->hasho_flag = LH_BUCKET_PAGE | LH_BUCKET_BEING_POPULATED;
@@ -1191,3 +1201,136 @@ _hash_finish_split(Relation rel, Buffer metabuf, Buffer obuf, Bucket obucket,
LockBuffer(obuf, BUFFER_LOCK_UNLOCK);
hash_destroy(tidhtab);
}
+
+/*
+ * _hash_getcachedmetap() -- Returns cached metapage data.
+ *
+ * If metabuf is not InvalidBuffer, caller must hold a pin, but no lock, on
+ * the metapage. If it is InvalidBuffer and we have to refresh the cache,
+ * we'll set it to a valid, pinned (but unlocked) metapage buffer before
+ * returning; caller is responsible for releasing the pin.
+ *
+ * We refresh the cache if it's not initialized yet or force_refresh is true.
+ */
+HashMetaPage
+_hash_getcachedmetap(Relation rel, Buffer *metabuf, bool force_refresh)
+{
+ Page page;
+
+ Assert(metabuf);
+ if (force_refresh || rel->rd_amcache == NULL)
+ {
+ char *cache;
+
+ /*
+ * It's important that we don't set rd_amcache to an invalid
+ * value. Either MemoryContextAlloc or _hash_getbuf could fail,
+ * so don't install a pointer to the newly-allocated storage in the
+ * actual relcache entry until both have succeeded.
+ */
+ if (rel->rd_amcache == NULL)
+ cache = MemoryContextAlloc(rel->rd_indexcxt,
+ sizeof(HashMetaPageData));
+
+ /* Read the metapage. */
+ if (BufferIsValid(*metabuf))
+ LockBuffer(*metabuf, BUFFER_LOCK_SHARE);
+ else
+ *metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ,
+ LH_META_PAGE);
+ page = BufferGetPage(*metabuf);
+
+ /* Populate the cache. */
+ if (rel->rd_amcache == NULL)
+ rel->rd_amcache = cache;
+ memcpy(rel->rd_amcache, HashPageGetMeta(page),
+ sizeof(HashMetaPageData));
+
+ /* Release metapage lock, but keep the pin. */
+ LockBuffer(*metabuf, BUFFER_LOCK_UNLOCK);
+ }
+
+ return (HashMetaPage) rel->rd_amcache;
+}
+
+/*
+ * _hash_getbucketbuf_from_hashkey() -- Get the bucket's buffer for the given
+ * hashkey.
+ *
+ * Bucket pages do not move or get removed once they are allocated. This
+ * gives us an opportunity to use the previously saved metapage contents to
+ * reach the target bucket buffer, instead of reading from the metapage every
+ * time. This saves one buffer access per lookup, which is a considerable
+ * savings in bufmgr traffic and contention.
+ *
+ * The access type parameter (HASH_READ or HASH_WRITE) indicates whether the
+ * bucket buffer has to be locked for reading or writing.
+ *
+ * The out parameter cachedmetap is set with metapage contents used for
+ * hashkey-to-bucket-buffer mapping. Some callers need this info to reach the
+ * old bucket in case of a bucket split; see _hash_doinsert().
+ */
+Buffer
+_hash_getbucketbuf_from_hashkey(Relation rel, uint32 hashkey, int access,
+ HashMetaPage *cachedmetap)
+{
+ HashMetaPage metap;
+ Buffer buf;
+ Buffer metabuf = InvalidBuffer;
+ Page page;
+ Bucket bucket;
+ BlockNumber blkno;
+ HashPageOpaque opaque;
+
+ /* We read from the target bucket buffer, so we must lock it. */
+ Assert(access == HASH_READ || access == HASH_WRITE);
+
+ metap = _hash_getcachedmetap(rel, &metabuf, false);
+ Assert(metap != NULL);
+
+ /*
+ * Loop until we get a lock on the correct target bucket.
+ */
+ for (;;)
+ {
+ /*
+ * Compute the target bucket number, and convert to block number.
+ */
+ bucket = _hash_hashkey2bucket(hashkey,
+ metap->hashm_maxbucket,
+ metap->hashm_highmask,
+ metap->hashm_lowmask);
+
+ blkno = BUCKET_TO_BLKNO(metap, bucket);
+
+ /* Fetch the primary bucket page for the bucket */
+ buf = _hash_getbuf(rel, blkno, access, LH_BUCKET_PAGE);
+ page = BufferGetPage(buf);
+ opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+ Assert(opaque->hasho_bucket == bucket);
+
+ /*
+ * If this bucket hasn't been split, we're done.
+ *
+ * NB: The check for InvalidBlockNumber is only needed for on-disk
+ * compatibility with indexes created before we started storing
+ * hashm_maxbucket in the primary page's hasho_prevblkno.
+ */
+ if (opaque->hasho_prevblkno == InvalidBlockNumber ||
+ opaque->hasho_prevblkno <= metap->hashm_maxbucket)
+ break;
+
+ /* Drop lock on this buffer, update cached metapage, and retry. */
+ _hash_relbuf(rel, buf);
+ metap = _hash_getcachedmetap(rel, &metabuf, true);
+ Assert(metap != NULL);
+ }
+
+ if (BufferIsValid(metabuf))
+ _hash_dropbuf(rel, metabuf);
+
+ if (cachedmetap)
+ *cachedmetap = metap;
+
+ return buf;
+}
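
Seen from a caller's side, the two new functions reduce to a simple pattern:
fetch the cached metapage contents, use them to locate pages, and refresh only
when a primary bucket page shows the cache has gone stale.  A condensed sketch
modeled on the hashbulkdelete changes above; the function name
example_scan_buckets is invented, the per-bucket work and error handling are
omitted, and it assumes the same headers the hash AM code itself uses:

#include "postgres.h"
#include "access/hash.h"

static void
example_scan_buckets(Relation rel)
{
    HashMetaPage cachedmetap;
    Buffer      metabuf = InvalidBuffer;
    Bucket      cur_bucket;

    /* Get cached metapage data; takes at most a pin on the metapage. */
    cachedmetap = _hash_getcachedmetap(rel, &metabuf, false);

    for (cur_bucket = 0; cur_bucket <= cachedmetap->hashm_maxbucket; cur_bucket++)
    {
        BlockNumber blkno = BUCKET_TO_BLKNO(cachedmetap, cur_bucket);
        Buffer      buf = _hash_getbuf(rel, blkno, HASH_READ, LH_BUCKET_PAGE);
        HashPageOpaque opaque =
            (HashPageOpaque) PageGetSpecialPointer(BufferGetPage(buf));

        /* Refresh the cache if this bucket was split after we cached. */
        if (opaque->hasho_prevblkno != InvalidBlockNumber &&
            opaque->hasho_prevblkno > cachedmetap->hashm_maxbucket)
            cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);

        /* ... per-bucket work would go here ... */

        _hash_relbuf(rel, buf);
    }

    /* Drop the metapage pin if _hash_getcachedmetap had to take one. */
    if (BufferIsValid(metabuf))
        _hash_dropbuf(rel, metabuf);
}
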
diff --git a/src/backend/access/hash/hashsearch.c b/src/backend/access/hash/hashsearch.c
index a59ad6ff707..9e5d7e4babe 100644
--- a/src/backend/access/hash/hashsearch.c
+++ b/src/backend/access/hash/hashsearch.c
@@ -139,6 +139,7 @@ _hash_readprev(IndexScanDesc scan,
BlockNumber blkno;
Relation rel = scan->indexRelation;
HashScanOpaque so = (HashScanOpaque) scan->opaque;
+ bool haveprevblk;
blkno = (*opaquep)->hasho_prevblkno;
@@ -147,15 +148,23 @@ _hash_readprev(IndexScanDesc scan,
* comments in _hash_first to know the reason of retaining pin.
*/
if (*bufp == so->hashso_bucket_buf || *bufp == so->hashso_split_bucket_buf)
+ {
LockBuffer(*bufp, BUFFER_LOCK_UNLOCK);
+ haveprevblk = false;
+ }
else
+ {
_hash_relbuf(rel, *bufp);
+ haveprevblk = true;
+ }
*bufp = InvalidBuffer;
/* check for interrupts while we're not holding any buffer lock */
CHECK_FOR_INTERRUPTS();
- if (BlockNumberIsValid(blkno))
+
+ if (haveprevblk)
{
+ Assert(BlockNumberIsValid(blkno));
*bufp = _hash_getbuf(rel, blkno, HASH_READ,
LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
*pagep = BufferGetPage(*bufp);
@@ -215,14 +224,9 @@ _hash_first(IndexScanDesc scan, ScanDirection dir)
ScanKey cur;
uint32 hashkey;
Bucket bucket;
- BlockNumber blkno;
- BlockNumber oldblkno = InvalidBuffer;
- bool retry = false;
Buffer buf;
- Buffer metabuf;
Page page;
HashPageOpaque opaque;
- HashMetaPage metap;
IndexTuple itup;
ItemPointer current;
OffsetNumber offnum;
@@ -277,59 +281,10 @@ _hash_first(IndexScanDesc scan, ScanDirection dir)
so->hashso_sk_hash = hashkey;
- /* Read the metapage */
- metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
- page = BufferGetPage(metabuf);
- metap = HashPageGetMeta(page);
-
- /*
- * Loop until we get a lock on the correct target bucket.
- */
- for (;;)
- {
- /*
- * Compute the target bucket number, and convert to block number.
- */
- bucket = _hash_hashkey2bucket(hashkey,
- metap->hashm_maxbucket,
- metap->hashm_highmask,
- metap->hashm_lowmask);
-
- blkno = BUCKET_TO_BLKNO(metap, bucket);
-
- /* Release metapage lock, but keep pin. */
- LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
-
- /*
- * If the previous iteration of this loop locked what is still the
- * correct target bucket, we are done. Otherwise, drop any old lock
- * and lock what now appears to be the correct bucket.
- */
- if (retry)
- {
- if (oldblkno == blkno)
- break;
- _hash_relbuf(rel, buf);
- }
-
- /* Fetch the primary bucket page for the bucket */
- buf = _hash_getbuf(rel, blkno, HASH_READ, LH_BUCKET_PAGE);
-
- /*
- * Reacquire metapage lock and check that no bucket split has taken
- * place while we were awaiting the bucket lock.
- */
- LockBuffer(metabuf, BUFFER_LOCK_SHARE);
- oldblkno = blkno;
- retry = true;
- }
-
- /* done with the metapage */
- _hash_dropbuf(rel, metabuf);
-
+ buf = _hash_getbucketbuf_from_hashkey(rel, hashkey, HASH_READ, NULL);
page = BufferGetPage(buf);
opaque = (HashPageOpaque) PageGetSpecialPointer(page);
- Assert(opaque->hasho_bucket == bucket);
+ bucket = opaque->hasho_bucket;
so->hashso_bucket_buf = buf;
diff --git a/src/include/access/hash.h b/src/include/access/hash.h
index 1a9b91f9f53..c0455851f46 100644
--- a/src/include/access/hash.h
+++ b/src/include/access/hash.h
@@ -61,10 +61,21 @@ typedef uint32 Bucket;
#define LH_PAGE_TYPE \
(LH_OVERFLOW_PAGE|LH_BUCKET_PAGE|LH_BITMAP_PAGE|LH_META_PAGE)
+/*
+ * In an overflow page, hasho_prevblkno stores the block number of the previous
+ * page in the bucket chain; in a bucket page, hasho_prevblkno stores the
+ * hashm_maxbucket value as of the last time the bucket was split, or
+ * else as of the time the bucket was created. The latter convention is used
+ * to determine whether a cached copy of the metapage is too stale to be used
+ * without needing to lock or pin the metapage.
+ *
+ * hasho_nextblkno is always the block number of the next page in the
+ * bucket chain, or InvalidBlockNumber if there are no more such pages.
+ */
typedef struct HashPageOpaqueData
{
- BlockNumber hasho_prevblkno; /* previous ovfl (or bucket) blkno */
- BlockNumber hasho_nextblkno; /* next ovfl blkno */
+ BlockNumber hasho_prevblkno; /* see above */
+ BlockNumber hasho_nextblkno; /* see above */
Bucket hasho_bucket; /* bucket number this pg belongs to */
uint16 hasho_flag; /* page type code, see above */
uint16 hasho_page_id; /* for identification of hash indexes */
@@ -309,6 +320,11 @@ extern Buffer _hash_getbuf(Relation rel, BlockNumber blkno,
int access, int flags);
extern Buffer _hash_getbuf_with_condlock_cleanup(Relation rel,
BlockNumber blkno, int flags);
+extern HashMetaPage _hash_getcachedmetap(Relation rel, Buffer *metabuf,
+ bool force_refresh);
+extern Buffer _hash_getbucketbuf_from_hashkey(Relation rel, uint32 hashkey,
+ int access,
+ HashMetaPage *cachedmetap);
extern Buffer _hash_getinitbuf(Relation rel, BlockNumber blkno);
extern Buffer _hash_getnewbuf(Relation rel, BlockNumber blkno,
ForkNumber forkNum);
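
Because hasho_prevblkno now has two meanings, code inspecting a page must know
whether it is looking at a primary bucket page or an overflow page before
interpreting the field.  A hypothetical helper (not part of the patch, name
invented) that spells out the staleness test for a primary bucket page:

#include "postgres.h"
#include "access/hash.h"

/*
 * Hypothetical helper: true if the cached metapage data is still usable for
 * the bucket whose primary page carries the given special-space data.  Only
 * primary bucket pages store a bucket count in hasho_prevblkno; overflow
 * pages still use it as a real previous block number.
 */
static bool
example_cached_metap_usable(HashPageOpaque opaque, HashMetaPage cachedmetap)
{
    Assert(opaque->hasho_flag & LH_BUCKET_PAGE);

    /* Pages from pre-v10 indexes store no bucket count; assume usable. */
    if (opaque->hasho_prevblkno == InvalidBlockNumber)
        return true;

    /* Usable unless the bucket was split after the metapage was cached. */
    return opaque->hasho_prevblkno <= cachedmetap->hashm_maxbucket;
}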