-rw-r--r--  contrib/pageinspect/expected/hash.out |   8
-rw-r--r--  doc/src/sgml/pageinspect.sgml         |   2
-rw-r--r--  src/backend/access/hash/README        |  68
-rw-r--r--  src/backend/access/hash/hash.c        |  59
-rw-r--r--  src/backend/access/hash/hashinsert.c  |  83
-rw-r--r--  src/backend/access/hash/hashpage.c    | 151
-rw-r--r--  src/backend/access/hash/hashsearch.c  |  69
-rw-r--r--  src/include/access/hash.h             |  20
8 files changed, 279 insertions(+), 181 deletions(-)
diff --git a/contrib/pageinspect/expected/hash.out b/contrib/pageinspect/expected/hash.out
index 31b67999fc8..7eb1537b29e 100644
--- a/contrib/pageinspect/expected/hash.out
+++ b/contrib/pageinspect/expected/hash.out
@@ -98,7 +98,7 @@ hash_page_stats(get_raw_page('test_hash_a_idx', 1));
live_items | 0
dead_items | 0
page_size | 8192
-hasho_prevblkno | 4294967295
+hasho_prevblkno | 3
hasho_nextblkno | 4294967295
hasho_bucket | 0
hasho_flag | 2
@@ -111,7 +111,7 @@ hash_page_stats(get_raw_page('test_hash_a_idx', 2));
live_items | 0
dead_items | 0
page_size | 8192
-hasho_prevblkno | 4294967295
+hasho_prevblkno | 3
hasho_nextblkno | 4294967295
hasho_bucket | 1
hasho_flag | 2
@@ -124,7 +124,7 @@ hash_page_stats(get_raw_page('test_hash_a_idx', 3));
live_items | 1
dead_items | 0
page_size | 8192
-hasho_prevblkno | 4294967295
+hasho_prevblkno | 3
hasho_nextblkno | 4294967295
hasho_bucket | 2
hasho_flag | 2
@@ -137,7 +137,7 @@ hash_page_stats(get_raw_page('test_hash_a_idx', 4));
live_items | 0
dead_items | 0
page_size | 8192
-hasho_prevblkno | 4294967295
+hasho_prevblkno | 3
hasho_nextblkno | 4294967295
hasho_bucket | 3
hasho_flag | 2
diff --git a/doc/src/sgml/pageinspect.sgml b/doc/src/sgml/pageinspect.sgml
index 4c201e75b0d..5e6712f9cde 100644
--- a/doc/src/sgml/pageinspect.sgml
+++ b/doc/src/sgml/pageinspect.sgml
@@ -539,7 +539,7 @@ live_items | 407
dead_items | 0
page_size | 8192
free_size | 8
-hasho_prevblkno | 4294967295
+hasho_prevblkno | 4096
hasho_nextblkno | 8474
hasho_bucket | 0
hasho_flag | 66
diff --git a/src/backend/access/hash/README b/src/backend/access/hash/README
index 01ea115f4d4..703ae982071 100644
--- a/src/backend/access/hash/README
+++ b/src/backend/access/hash/README
@@ -149,6 +149,50 @@ We choose to always lock the lower-numbered bucket first. The metapage is
only ever locked after all bucket locks have been taken.
+Metapage Caching
+----------------
+
+Both scanning the index and inserting tuples require locating the bucket
+where a given tuple ought to be located. To do this, we need the bucket
+count, highmask, and lowmask from the metapage; however, it's undesirable
+for performance reasons to have to lock and pin the metapage for
+every such operation. Instead, we retain a cached copy of the metapage
+in each backend's relcache entry. This will produce the correct
+bucket mapping as long as the target bucket hasn't been split since the
+last cache refresh.
+
+To guard against the possibility that such a split has occurred, the
+primary page of each bucket chain stores the number of buckets that
+existed as of the time the bucket was last split, or, if it has never been
+split, as of the time it was created, in the space normally used for the
+previous block number (that is, hasho_prevblkno). This doesn't cost
+anything because the primary bucket page is always the first page in
+the chain, and the previous block number is therefore always, in
+reality, InvalidBlockNumber.
+
+After computing the ostensibly-correct bucket number based on our cached
+copy of the metapage, we lock the corresponding primary bucket page and
+check whether the bucket count stored in hasho_prevblkno is greater than
+the number of buckets stored in our cached copy of the metapage. If
+so, the bucket has certainly been split, because the count must originally
+have been less than the number of buckets that existed at that time and
+can't have increased except due to a split. If not, the bucket can't have
+been split, because a split would have created a new bucket with a higher
+bucket number than any we'd seen previously. In the latter case, we've
+locked the correct bucket and can proceed; in the former case, we must
+release the lock on this bucket, lock the metapage, update our cache,
+unlock the metapage, and retry.
+
+Needing to retry occasionally might seem expensive, but the number of times
+any given bucket can be split is limited to a few dozen no matter how
+many times the hash index is accessed, because the total number of
+buckets is limited to less than 2^32. On the other hand, the number of
+times we access a bucket is unbounded and will be several orders of
+magnitude larger even in unsympathetic cases.
+
+(The metapage cache is new in v10. Older hash indexes had the primary
+bucket page's hasho_prevblkno initialized to InvalidBlockNumber.)
+
Pseudocode Algorithms
---------------------
@@ -188,17 +232,7 @@ track of available overflow pages.
The reader algorithm is:
- pin meta page and take buffer content lock in shared mode
- loop:
- compute bucket number for target hash key
- release meta page buffer content lock
- if (correct bucket page is already locked)
- break
- release any existing bucket page buffer content lock (if a concurrent
- split happened)
- take the buffer content lock on bucket page in shared mode
- retake meta page buffer content lock in shared mode
- release pin on metapage
+ lock the primary bucket page of the target bucket
if the target bucket is still being populated by a split:
release the buffer content lock on current bucket page
pin and acquire the buffer content lock on old bucket in shared mode
@@ -238,17 +272,7 @@ which this bucket is formed by split.
The insertion algorithm is rather similar:
- pin meta page and take buffer content lock in shared mode
- loop:
- compute bucket number for target hash key
- release meta page buffer content lock
- if (correct bucket page is already locked)
- break
- release any existing bucket page buffer content lock (if a concurrent
- split happened)
- take the buffer content lock on bucket page in exclusive mode
- retake meta page buffer content lock in shared mode
- release pin on metapage
+ lock the primary bucket page of the target bucket
-- (so far same as reader, except for acquisition of buffer content lock in
exclusive mode on primary bucket page)
if the bucket-being-split flag is set for a bucket and pin count on it is
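
The check described in the new "Metapage Caching" section boils down to a
single comparison plus a retry.  The following standalone C sketch models it
with invented toy types and values (toy_metapage, toy_bucket_page,
toy_refresh_cache); it is illustrative only and not part of the patch:

#include <stdio.h>

/* Stand-ins for the real structures; names and values are invented. */
typedef struct toy_metapage
{
    unsigned    maxbucket;      /* plays the role of hashm_maxbucket */
} toy_metapage;

typedef struct toy_bucket_page
{
    unsigned    prevblkno;      /* plays the role of hasho_prevblkno: the
                                 * bucket count as of this bucket's last
                                 * split (or its creation) */
} toy_bucket_page;

/* Pretend to reread the metapage under lock and update the local cache. */
static void
toy_refresh_cache(toy_metapage *cache, const toy_metapage *shared)
{
    *cache = *shared;
}

int
main(void)
{
    toy_metapage    shared = { 7 };     /* current shared state */
    toy_metapage    cache = { 3 };      /* stale backend-local copy */
    toy_bucket_page bucket = { 6 };     /* bucket split after we cached */

    /*
     * The README's rule: a stored count greater than the cached count means
     * the cache is stale, so refresh it (and, in the real code, recompute
     * the target bucket and relock).
     */
    while (bucket.prevblkno > cache.maxbucket)
    {
        printf("stale cache (stored %u > cached %u); refreshing\n",
               bucket.prevblkno, cache.maxbucket);
        toy_refresh_cache(&cache, &shared);
    }
    printf("cached maxbucket %u is current enough; proceed\n",
           cache.maxbucket);
    return 0;
}
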
diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c
index ec8ed33c708..97ad22aa6f3 100644
--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@@ -507,28 +507,24 @@ hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
Bucket orig_maxbucket;
Bucket cur_maxbucket;
Bucket cur_bucket;
- Buffer metabuf;
+ Buffer metabuf = InvalidBuffer;
HashMetaPage metap;
- HashMetaPageData local_metapage;
+ HashMetaPage cachedmetap;
tuples_removed = 0;
num_index_tuples = 0;
/*
- * Read the metapage to fetch original bucket and tuple counts. Also, we
- * keep a copy of the last-seen metapage so that we can use its
- * hashm_spares[] values to compute bucket page addresses. This is a bit
- * hokey but perfectly safe, since the interesting entries in the spares
- * array cannot change under us; and it beats rereading the metapage for
- * each bucket.
+ * We need a copy of the metapage so that we can use its hashm_spares[]
+ * values to compute bucket page addresses, but a cached copy should be
+ * good enough. (If not, we'll detect that further down and refresh the
+ * cache as necessary.)
*/
- metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
- metap = HashPageGetMeta(BufferGetPage(metabuf));
- orig_maxbucket = metap->hashm_maxbucket;
- orig_ntuples = metap->hashm_ntuples;
- memcpy(&local_metapage, metap, sizeof(local_metapage));
- /* release the lock, but keep pin */
- LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
+ cachedmetap = _hash_getcachedmetap(rel, &metabuf, false);
+ Assert(cachedmetap != NULL);
+
+ orig_maxbucket = cachedmetap->hashm_maxbucket;
+ orig_ntuples = cachedmetap->hashm_ntuples;
/* Scan the buckets that we know exist */
cur_bucket = 0;
@@ -546,7 +542,7 @@ loop_top:
bool split_cleanup = false;
/* Get address of bucket's start page */
- bucket_blkno = BUCKET_TO_BLKNO(&local_metapage, cur_bucket);
+ bucket_blkno = BUCKET_TO_BLKNO(cachedmetap, cur_bucket);
blkno = bucket_blkno;
@@ -577,20 +573,27 @@ loop_top:
* hashm_lowmask might be old enough to cause us to fail to remove
* tuples left behind by the most recent split. To prevent that,
* now that the primary page of the target bucket has been locked
- * (and thus can't be further split), update our cached metapage
- * data.
+ * (and thus can't be further split), check whether we need to
+ * update our cached metapage data.
+ *
+ * NB: The check for InvalidBlockNumber is only needed for
+ * on-disk compatibility with indexes created before we started
+ * storing hashm_maxbucket in the primary page's hasho_prevblkno.
*/
- LockBuffer(metabuf, BUFFER_LOCK_SHARE);
- memcpy(&local_metapage, metap, sizeof(local_metapage));
- LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
+ if (bucket_opaque->hasho_prevblkno != InvalidBlockNumber &&
+ bucket_opaque->hasho_prevblkno > cachedmetap->hashm_maxbucket)
+ {
+ cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
+ Assert(cachedmetap != NULL);
+ }
}
bucket_buf = buf;
hashbucketcleanup(rel, cur_bucket, bucket_buf, blkno, info->strategy,
- local_metapage.hashm_maxbucket,
- local_metapage.hashm_highmask,
- local_metapage.hashm_lowmask, &tuples_removed,
+ cachedmetap->hashm_maxbucket,
+ cachedmetap->hashm_highmask,
+ cachedmetap->hashm_lowmask, &tuples_removed,
&num_index_tuples, split_cleanup,
callback, callback_state);
@@ -600,6 +603,9 @@ loop_top:
cur_bucket++;
}
+ if (BufferIsInvalid(metabuf))
+ metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE);
+
/* Write-lock metapage and check for split since we started */
LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
metap = HashPageGetMeta(BufferGetPage(metabuf));
@@ -607,9 +613,10 @@ loop_top:
if (cur_maxbucket != metap->hashm_maxbucket)
{
/* There's been a split, so process the additional bucket(s) */
- cur_maxbucket = metap->hashm_maxbucket;
- memcpy(&local_metapage, metap, sizeof(local_metapage));
LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
+ cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
+ Assert(cachedmetap != NULL);
+ cur_maxbucket = cachedmetap->hashm_maxbucket;
goto loop_top;
}
diff --git a/src/backend/access/hash/hashinsert.c b/src/backend/access/hash/hashinsert.c
index 39c70d3a80f..dc63063ac1f 100644
--- a/src/backend/access/hash/hashinsert.c
+++ b/src/backend/access/hash/hashinsert.c
@@ -32,9 +32,7 @@ _hash_doinsert(Relation rel, IndexTuple itup)
Buffer bucket_buf;
Buffer metabuf;
HashMetaPage metap;
- BlockNumber blkno;
- BlockNumber oldblkno;
- bool retry;
+ HashMetaPage usedmetap = NULL;
Page metapage;
Page page;
HashPageOpaque pageopaque;
@@ -42,9 +40,6 @@ _hash_doinsert(Relation rel, IndexTuple itup)
bool do_expand;
uint32 hashkey;
Bucket bucket;
- uint32 maxbucket;
- uint32 highmask;
- uint32 lowmask;
/*
* Get the hash key for the item (it's stored in the index tuple itself).
@@ -57,10 +52,14 @@ _hash_doinsert(Relation rel, IndexTuple itup)
* need to be consistent */
restart_insert:
- /* Read the metapage */
- metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
+
+ /*
+ * Read the metapage. We don't lock it yet; HashMaxItemSize() will
+ * examine pd_pagesize_version, but that can't change, so no lock is
+ * needed.
+ */
+ metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE);
metapage = BufferGetPage(metabuf);
- metap = HashPageGetMeta(metapage);
/*
* Check whether the item can fit on a hash page at all. (Eventually, we
@@ -76,66 +75,17 @@ restart_insert:
itemsz, HashMaxItemSize(metapage)),
errhint("Values larger than a buffer page cannot be indexed.")));
- oldblkno = InvalidBlockNumber;
- retry = false;
-
- /*
- * Loop until we get a lock on the correct target bucket.
- */
- for (;;)
- {
- /*
- * Compute the target bucket number, and convert to block number.
- */
- bucket = _hash_hashkey2bucket(hashkey,
- metap->hashm_maxbucket,
- metap->hashm_highmask,
- metap->hashm_lowmask);
-
- blkno = BUCKET_TO_BLKNO(metap, bucket);
-
- /*
- * Copy bucket mapping info now; refer the comment in
- * _hash_expandtable where we copy this information before calling
- * _hash_splitbucket to see why this is okay.
- */
- maxbucket = metap->hashm_maxbucket;
- highmask = metap->hashm_highmask;
- lowmask = metap->hashm_lowmask;
-
- /* Release metapage lock, but keep pin. */
- LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
-
- /*
- * If the previous iteration of this loop locked the primary page of
- * what is still the correct target bucket, we are done. Otherwise,
- * drop any old lock before acquiring the new one.
- */
- if (retry)
- {
- if (oldblkno == blkno)
- break;
- _hash_relbuf(rel, buf);
- }
-
- /* Fetch and lock the primary bucket page for the target bucket */
- buf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BUCKET_PAGE);
-
- /*
- * Reacquire metapage lock and check that no bucket split has taken
- * place while we were awaiting the bucket lock.
- */
- LockBuffer(metabuf, BUFFER_LOCK_SHARE);
- oldblkno = blkno;
- retry = true;
- }
+ /* Lock the primary bucket page for the target bucket. */
+ buf = _hash_getbucketbuf_from_hashkey(rel, hashkey, HASH_WRITE,
+ &usedmetap);
+ Assert(usedmetap != NULL);
/* remember the primary bucket buffer to release the pin on it at end. */
bucket_buf = buf;
page = BufferGetPage(buf);
pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
- Assert(pageopaque->hasho_bucket == bucket);
+ bucket = pageopaque->hasho_bucket;
/*
* If this bucket is in the process of being split, try to finish the
@@ -151,8 +101,10 @@ restart_insert:
/* release the lock on bucket buffer, before completing the split. */
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
- _hash_finish_split(rel, metabuf, buf, pageopaque->hasho_bucket,
- maxbucket, highmask, lowmask);
+ _hash_finish_split(rel, metabuf, buf, bucket,
+ usedmetap->hashm_maxbucket,
+ usedmetap->hashm_highmask,
+ usedmetap->hashm_lowmask);
/* release the pin on old and meta buffer. retry for insert. */
_hash_dropbuf(rel, buf);
@@ -225,6 +177,7 @@ restart_insert:
*/
LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
+ metap = HashPageGetMeta(metapage);
metap->hashm_ntuples += 1;
/* Make sure this stays in sync with _hash_expandtable() */
diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c
index 69676eba953..d52f149389b 100644
--- a/src/backend/access/hash/hashpage.c
+++ b/src/backend/access/hash/hashpage.c
@@ -434,7 +434,13 @@ _hash_metapinit(Relation rel, double num_tuples, ForkNumber forkNum)
buf = _hash_getnewbuf(rel, BUCKET_TO_BLKNO(metap, i), forkNum);
pg = BufferGetPage(buf);
pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
- pageopaque->hasho_prevblkno = InvalidBlockNumber;
+
+ /*
+ * Set hasho_prevblkno with current hashm_maxbucket. This value will
+ * be used to validate cached HashMetaPageData. See
+ * _hash_getbucketbuf_from_hashkey().
+ */
+ pageopaque->hasho_prevblkno = metap->hashm_maxbucket;
pageopaque->hasho_nextblkno = InvalidBlockNumber;
pageopaque->hasho_bucket = i;
pageopaque->hasho_flag = LH_BUCKET_PAGE;
@@ -840,10 +846,14 @@ _hash_splitbucket(Relation rel,
oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
/*
- * Mark the old bucket to indicate that split is in progress. At
- * operation end, we clear split-in-progress flag.
+ * Mark the old bucket to indicate that split is in progress. (At
+ * operation end, we will clear the split-in-progress flag.) Also,
+ * for a primary bucket page, hasho_prevblkno stores the number of
+ * buckets that existed as of the last split, so we must update that
+ * value here.
*/
oopaque->hasho_flag |= LH_BUCKET_BEING_SPLIT;
+ oopaque->hasho_prevblkno = maxbucket;
npage = BufferGetPage(nbuf);
@@ -852,7 +862,7 @@ _hash_splitbucket(Relation rel,
* split is in progress.
*/
nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
- nopaque->hasho_prevblkno = InvalidBlockNumber;
+ nopaque->hasho_prevblkno = maxbucket;
nopaque->hasho_nextblkno = InvalidBlockNumber;
nopaque->hasho_bucket = nbucket;
nopaque->hasho_flag = LH_BUCKET_PAGE | LH_BUCKET_BEING_POPULATED;
@@ -1191,3 +1201,136 @@ _hash_finish_split(Relation rel, Buffer metabuf, Buffer obuf, Bucket obucket,
LockBuffer(obuf, BUFFER_LOCK_UNLOCK);
hash_destroy(tidhtab);
}
+
+/*
+ * _hash_getcachedmetap() -- Returns cached metapage data.
+ *
+ * If metabuf is not InvalidBuffer, caller must hold a pin, but no lock, on
+ * the metapage. If it is InvalidBuffer and we have to refresh the cache,
+ * we'll set it to a valid, pinned (but unlocked) metapage buffer before
+ * returning; caller is responsible for releasing the pin.
+ *
+ * We refresh the cache if it's not initialized yet or force_refresh is true.
+ */
+HashMetaPage
+_hash_getcachedmetap(Relation rel, Buffer *metabuf, bool force_refresh)
+{
+ Page page;
+
+ Assert(metabuf);
+ if (force_refresh || rel->rd_amcache == NULL)
+ {
+ char *cache;
+
+ /*
+ * It's important that we don't set rd_amcache to an invalid
+ * value. Either MemoryContextAlloc or _hash_getbuf could fail,
+ * so don't install a pointer to the newly-allocated storage in the
+ * actual relcache entry until both have succeeded.
+ */
+ if (rel->rd_amcache == NULL)
+ cache = MemoryContextAlloc(rel->rd_indexcxt,
+ sizeof(HashMetaPageData));
+
+ /* Read the metapage. */
+ if (BufferIsValid(*metabuf))
+ LockBuffer(*metabuf, BUFFER_LOCK_SHARE);
+ else
+ *metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ,
+ LH_META_PAGE);
+ page = BufferGetPage(*metabuf);
+
+ /* Populate the cache. */
+ if (rel->rd_amcache == NULL)
+ rel->rd_amcache = cache;
+ memcpy(rel->rd_amcache, HashPageGetMeta(page),
+ sizeof(HashMetaPageData));
+
+ /* Release metapage lock, but keep the pin. */
+ LockBuffer(*metabuf, BUFFER_LOCK_UNLOCK);
+ }
+
+ return (HashMetaPage) rel->rd_amcache;
+}
+
+/*
+ * _hash_getbucketbuf_from_hashkey() -- Get the bucket's buffer for the given
+ * hashkey.
+ *
+ * Bucket pages do not move or get removed once they are allocated. This
+ * gives us an opportunity to use the previously saved metapage contents to
+ * reach the target bucket buffer, instead of reading from the metapage every
+ * time. This saves one buffer access per lookup, which is a considerable
+ * savings in bufmgr traffic and contention.
+ *
+ * The access type parameter (HASH_READ or HASH_WRITE) indicates whether the
+ * bucket buffer has to be locked for reading or writing.
+ *
+ * The out parameter cachedmetap is set with metapage contents used for
+ * hashkey-to-bucket-buffer mapping. Some callers need this info to reach the
+ * old bucket in case of a bucket split; see _hash_doinsert().
+ */
+Buffer
+_hash_getbucketbuf_from_hashkey(Relation rel, uint32 hashkey, int access,
+ HashMetaPage *cachedmetap)
+{
+ HashMetaPage metap;
+ Buffer buf;
+ Buffer metabuf = InvalidBuffer;
+ Page page;
+ Bucket bucket;
+ BlockNumber blkno;
+ HashPageOpaque opaque;
+
+ /* We read from the target bucket buffer, so we must lock it. */
+ Assert(access == HASH_READ || access == HASH_WRITE);
+
+ metap = _hash_getcachedmetap(rel, &metabuf, false);
+ Assert(metap != NULL);
+
+ /*
+ * Loop until we get a lock on the correct target bucket.
+ */
+ for (;;)
+ {
+ /*
+ * Compute the target bucket number, and convert to block number.
+ */
+ bucket = _hash_hashkey2bucket(hashkey,
+ metap->hashm_maxbucket,
+ metap->hashm_highmask,
+ metap->hashm_lowmask);
+
+ blkno = BUCKET_TO_BLKNO(metap, bucket);
+
+ /* Fetch the primary bucket page for the bucket */
+ buf = _hash_getbuf(rel, blkno, access, LH_BUCKET_PAGE);
+ page = BufferGetPage(buf);
+ opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+ Assert(opaque->hasho_bucket == bucket);
+
+ /*
+ * If this bucket hasn't been split, we're done.
+ *
+ * NB: The check for InvalidBlockNumber is only needed for on-disk
+ * compatibility with indexes created before we started storing
+ * hashm_maxbucket in the primary page's hasho_prevblkno.
+ */
+ if (opaque->hasho_prevblkno == InvalidBlockNumber ||
+ opaque->hasho_prevblkno <= metap->hashm_maxbucket)
+ break;
+
+ /* Drop lock on this buffer, update cached metapage, and retry. */
+ _hash_relbuf(rel, buf);
+ metap = _hash_getcachedmetap(rel, &metabuf, true);
+ Assert(metap != NULL);
+ }
+
+ if (BufferIsValid(metabuf))
+ _hash_dropbuf(rel, metabuf);
+
+ if (cachedmetap)
+ *cachedmetap = metap;
+
+ return buf;
+}
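
Seen from a caller's side, the two new functions reduce to a simple pattern:
fetch the cached metapage contents, use them to locate pages, and refresh only
when a primary bucket page shows the cache has gone stale.  A condensed sketch
modeled on the hashbulkdelete changes above; the function name
example_scan_buckets is invented, the per-bucket work and error handling are
omitted, and it assumes the same headers the hash AM code itself uses:

#include "postgres.h"
#include "access/hash.h"

static void
example_scan_buckets(Relation rel)
{
    HashMetaPage cachedmetap;
    Buffer      metabuf = InvalidBuffer;
    Bucket      cur_bucket;

    /* Get cached metapage data; takes at most a pin on the metapage. */
    cachedmetap = _hash_getcachedmetap(rel, &metabuf, false);

    for (cur_bucket = 0; cur_bucket <= cachedmetap->hashm_maxbucket; cur_bucket++)
    {
        BlockNumber blkno = BUCKET_TO_BLKNO(cachedmetap, cur_bucket);
        Buffer      buf = _hash_getbuf(rel, blkno, HASH_READ, LH_BUCKET_PAGE);
        HashPageOpaque opaque =
            (HashPageOpaque) PageGetSpecialPointer(BufferGetPage(buf));

        /* Refresh the cache if this bucket was split after we cached. */
        if (opaque->hasho_prevblkno != InvalidBlockNumber &&
            opaque->hasho_prevblkno > cachedmetap->hashm_maxbucket)
            cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);

        /* ... per-bucket work would go here ... */

        _hash_relbuf(rel, buf);
    }

    /* Drop the metapage pin if _hash_getcachedmetap had to take one. */
    if (BufferIsValid(metabuf))
        _hash_dropbuf(rel, metabuf);
}
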
diff --git a/src/backend/access/hash/hashsearch.c b/src/backend/access/hash/hashsearch.c
index a59ad6ff707..9e5d7e4babe 100644
--- a/src/backend/access/hash/hashsearch.c
+++ b/src/backend/access/hash/hashsearch.c
@@ -139,6 +139,7 @@ _hash_readprev(IndexScanDesc scan,
BlockNumber blkno;
Relation rel = scan->indexRelation;
HashScanOpaque so = (HashScanOpaque) scan->opaque;
+ bool haveprevblk;
blkno = (*opaquep)->hasho_prevblkno;
@@ -147,15 +148,23 @@ _hash_readprev(IndexScanDesc scan,
* comments in _hash_first to know the reason of retaining pin.
*/
if (*bufp == so->hashso_bucket_buf || *bufp == so->hashso_split_bucket_buf)
+ {
LockBuffer(*bufp, BUFFER_LOCK_UNLOCK);
+ haveprevblk = false;
+ }
else
+ {
_hash_relbuf(rel, *bufp);
+ haveprevblk = true;
+ }
*bufp = InvalidBuffer;
/* check for interrupts while we're not holding any buffer lock */
CHECK_FOR_INTERRUPTS();
- if (BlockNumberIsValid(blkno))
+
+ if (haveprevblk)
{
+ Assert(BlockNumberIsValid(blkno));
*bufp = _hash_getbuf(rel, blkno, HASH_READ,
LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
*pagep = BufferGetPage(*bufp);
@@ -215,14 +224,9 @@ _hash_first(IndexScanDesc scan, ScanDirection dir)
ScanKey cur;
uint32 hashkey;
Bucket bucket;
- BlockNumber blkno;
- BlockNumber oldblkno = InvalidBuffer;
- bool retry = false;
Buffer buf;
- Buffer metabuf;
Page page;
HashPageOpaque opaque;
- HashMetaPage metap;
IndexTuple itup;
ItemPointer current;
OffsetNumber offnum;
@@ -277,59 +281,10 @@ _hash_first(IndexScanDesc scan, ScanDirection dir)
so->hashso_sk_hash = hashkey;
- /* Read the metapage */
- metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
- page = BufferGetPage(metabuf);
- metap = HashPageGetMeta(page);
-
- /*
- * Loop until we get a lock on the correct target bucket.
- */
- for (;;)
- {
- /*
- * Compute the target bucket number, and convert to block number.
- */
- bucket = _hash_hashkey2bucket(hashkey,
- metap->hashm_maxbucket,
- metap->hashm_highmask,
- metap->hashm_lowmask);
-
- blkno = BUCKET_TO_BLKNO(metap, bucket);
-
- /* Release metapage lock, but keep pin. */
- LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
-
- /*
- * If the previous iteration of this loop locked what is still the
- * correct target bucket, we are done. Otherwise, drop any old lock
- * and lock what now appears to be the correct bucket.
- */
- if (retry)
- {
- if (oldblkno == blkno)
- break;
- _hash_relbuf(rel, buf);
- }
-
- /* Fetch the primary bucket page for the bucket */
- buf = _hash_getbuf(rel, blkno, HASH_READ, LH_BUCKET_PAGE);
-
- /*
- * Reacquire metapage lock and check that no bucket split has taken
- * place while we were awaiting the bucket lock.
- */
- LockBuffer(metabuf, BUFFER_LOCK_SHARE);
- oldblkno = blkno;
- retry = true;
- }
-
- /* done with the metapage */
- _hash_dropbuf(rel, metabuf);
-
+ buf = _hash_getbucketbuf_from_hashkey(rel, hashkey, HASH_READ, NULL);
page = BufferGetPage(buf);
opaque = (HashPageOpaque) PageGetSpecialPointer(page);
- Assert(opaque->hasho_bucket == bucket);
+ bucket = opaque->hasho_bucket;
so->hashso_bucket_buf = buf;
diff --git a/src/include/access/hash.h b/src/include/access/hash.h
index 1a9b91f9f53..c0455851f46 100644
--- a/src/include/access/hash.h
+++ b/src/include/access/hash.h
@@ -61,10 +61,21 @@ typedef uint32 Bucket;
#define LH_PAGE_TYPE \
(LH_OVERFLOW_PAGE|LH_BUCKET_PAGE|LH_BITMAP_PAGE|LH_META_PAGE)
+/*
+ * In an overflow page, hasho_prevblkno stores the block number of the previous
+ * page in the bucket chain; in a bucket page, hasho_prevblkno stores the
+ * hashm_maxbucket value as of the last time the bucket was split, or
+ * else as of the time the bucket was created. The latter convention is used
+ * to determine whether a cached copy of the metapage is too stale to be used
+ * without needing to lock or pin the metapage.
+ *
+ * hasho_nextblkno is always the block number of the next page in the
+ * bucket chain, or InvalidBlockNumber if there are no more such pages.
+ */
typedef struct HashPageOpaqueData
{
- BlockNumber hasho_prevblkno; /* previous ovfl (or bucket) blkno */
- BlockNumber hasho_nextblkno; /* next ovfl blkno */
+ BlockNumber hasho_prevblkno; /* see above */
+ BlockNumber hasho_nextblkno; /* see above */
Bucket hasho_bucket; /* bucket number this pg belongs to */
uint16 hasho_flag; /* page type code, see above */
uint16 hasho_page_id; /* for identification of hash indexes */
@@ -309,6 +320,11 @@ extern Buffer _hash_getbuf(Relation rel, BlockNumber blkno,
int access, int flags);
extern Buffer _hash_getbuf_with_condlock_cleanup(Relation rel,
BlockNumber blkno, int flags);
+extern HashMetaPage _hash_getcachedmetap(Relation rel, Buffer *metabuf,
+ bool force_refresh);
+extern Buffer _hash_getbucketbuf_from_hashkey(Relation rel, uint32 hashkey,
+ int access,
+ HashMetaPage *cachedmetap);
extern Buffer _hash_getinitbuf(Relation rel, BlockNumber blkno);
extern Buffer _hash_getnewbuf(Relation rel, BlockNumber blkno,
ForkNumber forkNum);
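
Because hasho_prevblkno now has two meanings, code inspecting a page must know
whether it is looking at a primary bucket page or an overflow page before
interpreting the field.  A hypothetical helper (not part of the patch, name
invented) that spells out the staleness test for a primary bucket page:

#include "postgres.h"
#include "access/hash.h"

/*
 * Hypothetical helper: true if the cached metapage data is still usable for
 * the bucket whose primary page carries the given special-space data.  Only
 * primary bucket pages store a bucket count in hasho_prevblkno; overflow
 * pages still use it as a real previous block number.
 */
static bool
example_cached_metap_usable(HashPageOpaque opaque, HashMetaPage cachedmetap)
{
    Assert(opaque->hasho_flag & LH_BUCKET_PAGE);

    /* Pages from pre-v10 indexes store no bucket count; assume usable. */
    if (opaque->hasho_prevblkno == InvalidBlockNumber)
        return true;

    /* Usable unless the bucket was split after the metapage was cached. */
    return opaque->hasho_prevblkno <= cachedmetap->hashm_maxbucket;
}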