author    Tom Lane <tgl@sss.pgh.pa.us>    2007-04-19 20:24:28 +0000
committer Tom Lane <tgl@sss.pgh.pa.us>    2007-04-19 20:24:28 +0000
commit    0f93ebd3389ca5726edae3e5600df600f901bafb (patch)
tree      d7b215e8f14cb866dbc1d2c628f26a6ad66d00a0 /src/backend/access/hash/hashpage.c
parent    77be1e891d88e0c43b9b97ec00d2a7c5d7796219 (diff)
Repair PANIC condition in hash indexes when a previous index extension attempt failed (due to lock conflicts or out-of-space). We might have already extended the index's filesystem EOF before failing, causing the EOF to be beyond what the metapage says is the last used page. Hence the invariant maintained by the code needs to be "EOF is at or beyond last used page", not "EOF is exactly the last used page".

Problem was created by my patch of 2006-11-19 that attempted to repair bug #2737. Since that was back-patched to 7.4, this needs to be as well. Per report and test case from Vlastimil Krejcir.
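
To make the relaxed invariant concrete, here is a minimal standalone sketch (plain C, not PostgreSQL source; the helper name classify_new_page and its sample inputs are illustrative) of the decision the new _hash_getnewbuf in this patch makes when handed the block number that should follow the last used page:

    #include <stdint.h>
    #include <stdio.h>

    typedef uint32_t BlockNumber;

    /*
     * Illustrative only: models the check the patched _hash_getnewbuf
     * performs.  "nblocks" stands in for RelationGetNumberOfBlocks(rel),
     * i.e. the filesystem EOF measured in blocks.
     */
    static const char *
    classify_new_page(BlockNumber blkno, BlockNumber nblocks)
    {
        if (blkno > nblocks)
            return "error: noncontiguous page";     /* a hole in the file */
        if (blkno == nblocks)
            return "extend relation via P_NEW";     /* the normal case */
        return "reuse existing block";              /* EOF already beyond the
                                                     * metapage's last used page */
    }

    int
    main(void)
    {
        /* Normal extension: the requested page sits exactly at EOF. */
        printf("%s\n", classify_new_page(10, 10));

        /*
         * A prior extension attempt grew the file, then failed before the
         * metapage was updated; the patch tolerates this instead of PANICing.
         */
        printf("%s\n", classify_new_page(10, 12));

        /* Skipping past EOF is still an error. */
        printf("%s\n", classify_new_page(15, 10));
        return 0;
    }

The middle case is the one the old code could not survive: a block that already exists below the filesystem EOF because a failed extension attempt grew the file without updating the metapage.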
Diffstat (limited to 'src/backend/access/hash/hashpage.c')
-rw-r--r--  src/backend/access/hash/hashpage.c  |  175
1 file changed, 108 insertions(+), 67 deletions(-)
diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c
index 07b4ab870fd..aa58ffb0eb9 100644
--- a/src/backend/access/hash/hashpage.c
+++ b/src/backend/access/hash/hashpage.c
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.47.4.1 2006/11/19 21:33:37 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.47.4.2 2007/04/19 20:24:28 tgl Exp $
*
* NOTES
* Postgres hash pages look like ordinary relation pages. The opaque
@@ -35,7 +35,8 @@
#include "utils/lsyscache.h"
-static BlockNumber _hash_alloc_buckets(Relation rel, uint32 nblocks);
+static bool _hash_alloc_buckets(Relation rel, BlockNumber firstblock,
+ uint32 nblocks);
static void _hash_splitbucket(Relation rel, Buffer metabuf,
Bucket obucket, Bucket nbucket,
BlockNumber start_oblkno,
@@ -103,14 +104,18 @@ _hash_droplock(Relation rel, BlockNumber whichlock, int access)
* requested buffer and its reference count has been incremented
* (ie, the buffer is "locked and pinned").
*
- * blkno == P_NEW is allowed, but it is caller's responsibility to
- * ensure that only one process can extend the index at a time.
+ * P_NEW is disallowed because this routine should only be used
+ * to access pages that are known to be before the filesystem EOF.
+ * Extending the index should be done with _hash_getnewbuf.
*/
Buffer
_hash_getbuf(Relation rel, BlockNumber blkno, int access)
{
Buffer buf;
+ if (blkno == P_NEW)
+ elog(ERROR, "hash AM does not use P_NEW");
+
buf = ReadBuffer(rel, blkno);
if (access != HASH_NOLOCK)
@@ -121,6 +126,51 @@ _hash_getbuf(Relation rel, BlockNumber blkno, int access)
}
/*
+ * _hash_getnewbuf() -- Get a new page at the end of the index.
+ *
+ * This has the same API as _hash_getbuf, except that we are adding
+ * a page to the index, and hence expect the page to be past the
+ * logical EOF. (However, we have to support the case where it isn't,
+ * since a prior try might have crashed after extending the filesystem
+ * EOF but before updating the metapage to reflect the added page.)
+ *
+ * It is caller's responsibility to ensure that only one process can
+ * extend the index at a time.
+ *
+ * All call sites should call _hash_pageinit on the returned page.
+ * Also, it's difficult to imagine why access would not be HASH_WRITE.
+ */
+Buffer
+_hash_getnewbuf(Relation rel, BlockNumber blkno, int access)
+{
+ BlockNumber nblocks = RelationGetNumberOfBlocks(rel);
+ Buffer buf;
+
+ if (blkno == P_NEW)
+ elog(ERROR, "hash AM does not use P_NEW");
+ if (blkno > nblocks)
+ elog(ERROR, "access to noncontiguous page in hash index \"%s\"",
+ RelationGetRelationName(rel));
+
+ /* smgr insists we use P_NEW to extend the relation */
+ if (blkno == nblocks)
+ {
+ buf = ReadBuffer(rel, P_NEW);
+ if (BufferGetBlockNumber(buf) != blkno)
+ elog(ERROR, "unexpected hash relation size: %u, should be %u",
+ BufferGetBlockNumber(buf), blkno);
+ }
+ else
+ buf = ReadBuffer(rel, blkno);
+
+ if (access != HASH_NOLOCK)
+ LockBuffer(buf, access);
+
+ /* ref count and lock type are correct */
+ return buf;
+}
+
+/*
* _hash_relbuf() -- release a locked buffer.
*
* Lock and pin (refcount) are both dropped. Note that either read or
@@ -252,12 +302,11 @@ _hash_metapinit(Relation rel)
/*
* We initialize the metapage, the first two bucket pages, and the
- * first bitmap page in sequence, using P_NEW to cause smgrextend()
- * calls to occur. This ensures that the smgr level has the right
- * idea of the physical index length.
+ * first bitmap page in sequence, using _hash_getnewbuf to cause
+ * smgrextend() calls to occur. This ensures that the smgr level
+ * has the right idea of the physical index length.
*/
- metabuf = _hash_getbuf(rel, P_NEW, HASH_WRITE);
- Assert(BufferGetBlockNumber(metabuf) == HASH_METAPAGE);
+ metabuf = _hash_getnewbuf(rel, HASH_METAPAGE, HASH_WRITE);
pg = BufferGetPage(metabuf);
_hash_pageinit(pg, BufferGetPageSize(metabuf));
@@ -311,8 +360,7 @@ _hash_metapinit(Relation rel)
*/
for (i = 0; i <= 1; i++)
{
- buf = _hash_getbuf(rel, P_NEW, HASH_WRITE);
- Assert(BufferGetBlockNumber(buf) == BUCKET_TO_BLKNO(metap, i));
+ buf = _hash_getnewbuf(rel, BUCKET_TO_BLKNO(metap, i), HASH_WRITE);
pg = BufferGetPage(buf);
_hash_pageinit(pg, BufferGetPageSize(buf));
pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
@@ -360,7 +408,6 @@ _hash_expandtable(Relation rel, Buffer metabuf)
Bucket old_bucket;
Bucket new_bucket;
uint32 spare_ndx;
- BlockNumber firstblock = InvalidBlockNumber;
BlockNumber start_oblkno;
BlockNumber start_nblkno;
uint32 maxbucket;
@@ -414,38 +461,14 @@ _hash_expandtable(Relation rel, Buffer metabuf)
goto fail;
/*
- * If the split point is increasing (hashm_maxbucket's log base 2
- * increases), we need to allocate a new batch of bucket pages.
- */
- new_bucket = metap->hashm_maxbucket + 1;
- spare_ndx = _hash_log2(new_bucket + 1);
- if (spare_ndx > metap->hashm_ovflpoint)
- {
- Assert(spare_ndx == metap->hashm_ovflpoint + 1);
- /*
- * The number of buckets in the new splitpoint is equal to the
- * total number already in existence, i.e. new_bucket. Currently
- * this maps one-to-one to blocks required, but someday we may need
- * a more complicated calculation here.
- */
- firstblock = _hash_alloc_buckets(rel, new_bucket);
- if (firstblock == InvalidBlockNumber)
- goto fail; /* can't split due to BlockNumber overflow */
- }
-
- /*
* Determine which bucket is to be split, and attempt to lock the old
* bucket. If we can't get the lock, give up.
*
* The lock protects us against other backends, but not against our own
* backend. Must check for active scans separately.
- *
- * Ideally we would lock the new bucket too before proceeding, but if we
- * are about to cross a splitpoint then the BUCKET_TO_BLKNO mapping
- * isn't correct yet. For simplicity we update the metapage first and
- * then lock. This should be okay because no one else should be
- * trying to lock the new bucket yet...
*/
+ new_bucket = metap->hashm_maxbucket + 1;
+
old_bucket = (new_bucket & metap->hashm_lowmask);
start_oblkno = BUCKET_TO_BLKNO(metap, old_bucket);
@@ -457,6 +480,45 @@ _hash_expandtable(Relation rel, Buffer metabuf)
goto fail;
/*
+ * Likewise lock the new bucket (should never fail).
+ *
+ * Note: it is safe to compute the new bucket's blkno here, even though
+ * we may still need to update the BUCKET_TO_BLKNO mapping. This is
+ * because the current value of hashm_spares[hashm_ovflpoint] correctly
+ * shows where we are going to put a new splitpoint's worth of buckets.
+ */
+ start_nblkno = BUCKET_TO_BLKNO(metap, new_bucket);
+
+ if (_hash_has_active_scan(rel, new_bucket))
+ elog(ERROR, "scan in progress on supposedly new bucket");
+
+ if (!_hash_try_getlock(rel, start_nblkno, HASH_EXCLUSIVE))
+ elog(ERROR, "could not get lock on supposedly new bucket");
+
+ /*
+ * If the split point is increasing (hashm_maxbucket's log base 2
+ * increases), we need to allocate a new batch of bucket pages.
+ */
+ spare_ndx = _hash_log2(new_bucket + 1);
+ if (spare_ndx > metap->hashm_ovflpoint)
+ {
+ Assert(spare_ndx == metap->hashm_ovflpoint + 1);
+ /*
+ * The number of buckets in the new splitpoint is equal to the
+ * total number already in existence, i.e. new_bucket. Currently
+ * this maps one-to-one to blocks required, but someday we may need
+ * a more complicated calculation here.
+ */
+ if (!_hash_alloc_buckets(rel, start_nblkno, new_bucket))
+ {
+ /* can't split due to BlockNumber overflow */
+ _hash_droplock(rel, start_oblkno, HASH_EXCLUSIVE);
+ _hash_droplock(rel, start_nblkno, HASH_EXCLUSIVE);
+ goto fail;
+ }
+ }
+
+ /*
* Okay to proceed with split. Update the metapage bucket mapping
* info.
*/
@@ -481,20 +543,6 @@ _hash_expandtable(Relation rel, Buffer metabuf)
metap->hashm_ovflpoint = spare_ndx;
}
- /* now we can compute the new bucket's primary block number */
- start_nblkno = BUCKET_TO_BLKNO(metap, new_bucket);
-
- /* if we added a splitpoint, should match result of _hash_alloc_buckets */
- if (firstblock != InvalidBlockNumber &&
- firstblock != start_nblkno)
- elog(PANIC, "unexpected hash relation size: %u, should be %u",
- firstblock, start_nblkno);
-
- Assert(!_hash_has_active_scan(rel, new_bucket));
-
- if (!_hash_try_getlock(rel, start_nblkno, HASH_EXCLUSIVE))
- elog(PANIC, "could not get lock on supposedly new bucket");
-
/*
* Copy bucket mapping info now; this saves re-accessing the meta page
* inside _hash_splitbucket's inner loop. Note that once we drop the
@@ -558,23 +606,16 @@ fail:
* for the purpose. OTOH, adding a splitpoint is a very infrequent operation,
* so it may not be worth worrying about.
*
- * Returns the first block number in the new splitpoint's range, or
- * InvalidBlockNumber if allocation failed due to BlockNumber overflow.
+ * Returns TRUE if successful, or FALSE if allocation failed due to
+ * BlockNumber overflow.
*/
-static BlockNumber
-_hash_alloc_buckets(Relation rel, uint32 nblocks)
+static bool
+_hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks)
{
- BlockNumber firstblock;
BlockNumber lastblock;
BlockNumber endblock;
char zerobuf[BLCKSZ];
- /*
- * Since we hold metapage lock, no one else is either splitting or
- * allocating a new page in _hash_getovflpage(); hence it's safe to
- * assume that the relation length isn't changing under us.
- */
- firstblock = RelationGetNumberOfBlocks(rel);
lastblock = firstblock + nblocks - 1;
/*
@@ -582,12 +623,12 @@ _hash_alloc_buckets(Relation rel, uint32 nblocks)
* extend the index anymore.
*/
if (lastblock < firstblock || lastblock == InvalidBlockNumber)
- return InvalidBlockNumber;
-
- /* Note: we assume RelationGetNumberOfBlocks did RelationOpenSmgr for us */
+ return false;
MemSet(zerobuf, 0, sizeof(zerobuf));
+ RelationOpenSmgr(rel);
+
/*
* XXX If the extension results in creation of new segment files,
* we have to make sure that each non-last file is correctly filled out to
@@ -604,7 +645,7 @@ _hash_alloc_buckets(Relation rel, uint32 nblocks)
smgrextend(rel->rd_smgr, lastblock, zerobuf, rel->rd_istemp);
- return firstblock;
+ return true;
}
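
For reference, the splitpoint arithmetic that _hash_expandtable and _hash_alloc_buckets rely on can be sketched standalone (plain C; hash_log2 computes the same ceiling-log2 as the AM's _hash_log2 helper, and the overflow guard mirrors the lastblock test above — the surrounding names and sample values are illustrative):

    #include <stdint.h>
    #include <stdio.h>

    typedef uint32_t BlockNumber;
    #define InvalidBlockNumber ((BlockNumber) 0xFFFFFFFF)

    /* Smallest i such that (1 << i) >= num, i.e. ceil(log2(num)). */
    static uint32_t
    hash_log2(uint32_t num)
    {
        uint32_t i = 0;
        uint32_t limit = 1;

        while (limit < num)
        {
            limit <<= 1;
            i++;
        }
        return i;
    }

    /*
     * Mirrors the guard in _hash_alloc_buckets: the last block of the new
     * splitpoint is firstblock + nblocks - 1, and the split must be
     * refused if that wraps around or lands on InvalidBlockNumber.
     */
    static int
    alloc_would_overflow(BlockNumber firstblock, uint32_t nblocks)
    {
        BlockNumber lastblock = firstblock + nblocks - 1;

        return (lastblock < firstblock || lastblock == InvalidBlockNumber);
    }

    int
    main(void)
    {
        /*
         * Splitting to create bucket 4: spare_ndx = ceil(log2(5)) = 3 is
         * one past the previous splitpoint, so a batch of new_bucket = 4
         * bucket pages is needed, doubling the buckets in existence.
         */
        uint32_t new_bucket = 4;

        printf("spare_ndx = %u, blocks to allocate = %u\n",
               hash_log2(new_bucket + 1), new_bucket);

        /* Near the top of the BlockNumber range the addition wraps. */
        printf("overflow? %d\n", alloc_would_overflow(0xFFFFFFF0u, 32));
        return 0;
    }

When alloc_would_overflow is true, the real code now drops both bucket locks and falls through to the fail path rather than attempting the split.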