author     Noah Misch <noah@leadboat.com>   2024-04-13 08:34:20 -0700
committer  Noah Misch <noah@leadboat.com>   2024-04-13 08:35:20 -0700
commit     7c490a18b75b895c757a5340c33e05ca44a178f9 (patch)
tree       a84fd715d481e0e825997a0492f2cd9c69e270c1 /src/backend/storage/freespace/freespace.c
parent     268e723b12fad75a8171b2f3fcfaaa7881751911 (diff)
freespace: Don't return blocks past the end of the main fork.
GetPageWithFreeSpace() callers assume the returned block exists in the
main fork, failing with "could not read block" errors if that doesn't
hold. Make that assumption reliable now. It hadn't been guaranteed,
due to the weak WAL and data ordering of participating components. Most
operations on the fsm fork are not WAL-logged. Relation extension is
not WAL-logged. Hence, an fsm-fork block on disk can reference a
main-fork block that no WAL record has initialized. That could happen
after an OS crash, a replica promote, or a PITR restore. wal_log_hints
makes the trouble easier to hit; a replica promote or PITR ending just
after a relevant fsm-fork FPI_FOR_HINT may yield this broken state. The
v16 RelationAddBlocks() mechanism also makes the trouble easier to hit,
since it bulk-extends even without extension lock waiters. Commit
917dc7d2393ce680dea7a59418be9ff341df3c14 stopped trouble around
truncation, but vectors involving PageIsNew() pages remained.
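As an illustration (not part of the patch), the caller pattern at issue
looks roughly like the following sketch; "rel" and "len" are assumed
inputs, and the shape loosely follows hio.c's RelationGetBufferForTuple():

	/* Sketch only: a typical GetPageWithFreeSpace() caller. */
	BlockNumber targetBlock = GetPageWithFreeSpace(rel, len);

	if (targetBlock != InvalidBlockNumber)
	{
		/*
		 * The caller assumes targetBlock exists in the main fork.  When a
		 * stale FSM page supplied a block past EOF, this read failed with
		 * "could not read block".
		 */
		Buffer		buf = ReadBuffer(rel, targetBlock);

		/* ... place the tuple on this page ... */
		ReleaseBuffer(buf);
	}
	else
	{
		/* FSM knows of no page with enough space; extend the relation. */
	}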
This implementation adds a RelationGetNumberOfBlocks() call when the
cached relation size doesn't confirm a block exists. We've been unable
to identify a benchmark that slows materially, but this may show up as
additional time in lseek(). An alternative without that overhead would
be a new ReadBufferMode such that ReadBufferExtended() returns NULL
after a 0-byte read, with all other errors handled normally. However,
each GetFreeIndexPage() caller would then need code for the return-NULL
case. Back-patch to v14, due to earlier versions not caching relation
size and the absence of a pre-v16 problem report.
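To make the rejected design concrete, each GetFreeIndexPage() caller would
have needed a branch along these lines; RBM_RETURN_NULL_ON_ZERO_READ is an
invented name here, since no such ReadBufferMode was ever implemented:

	/* Hypothetical sketch only; this ReadBufferMode does not exist. */
	BlockNumber blkno = GetFreeIndexPage(rel);

	if (blkno != InvalidBlockNumber)
	{
		Buffer		buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
											 RBM_RETURN_NULL_ON_ZERO_READ,
											 NULL);

		if (!BufferIsValid(buf))
		{
			/* 0-byte read: FSM pointed past EOF; treat as no free page. */
		}
	}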
Ronan Dunklau. Reported by Ronan Dunklau.
Discussion: https://postgr.es/m/1878547.tdWV9SEqCh%40aivenlaptop
Diffstat (limited to 'src/backend/storage/freespace/freespace.c')
-rw-r--r--   src/backend/storage/freespace/freespace.c   106
1 file changed, 94 insertions, 12 deletions
diff --git a/src/backend/storage/freespace/freespace.c b/src/backend/storage/freespace/freespace.c
index d41ae37090a..01adf287958 100644
--- a/src/backend/storage/freespace/freespace.c
+++ b/src/backend/storage/freespace/freespace.c
@@ -112,6 +112,7 @@ static BlockNumber fsm_search(Relation rel, uint8 min_cat);
 static uint8 fsm_vacuum_page(Relation rel, FSMAddress addr,
 							 BlockNumber start, BlockNumber end,
 							 bool *eof);
+static bool fsm_does_block_exist(Relation rel, BlockNumber blknumber);
 
 
 /******** Public API ********/
@@ -128,6 +129,9 @@ static uint8 fsm_vacuum_page(Relation rel, FSMAddress addr,
  * amount of free space available on that page and then try again (see
  * RecordAndGetPageWithFreeSpace). If InvalidBlockNumber is returned,
  * extend the relation.
+ *
+ * This can trigger FSM updates if any FSM entry is found to point to a block
+ * past the end of the relation.
  */
 BlockNumber
 GetPageWithFreeSpace(Relation rel, Size spaceNeeded)
@@ -166,9 +170,17 @@ RecordAndGetPageWithFreeSpace(Relation rel, BlockNumber oldPage,
 	 * Otherwise, search as usual.
 	 */
 	if (search_slot != -1)
-		return fsm_get_heap_blk(addr, search_slot);
-	else
-		return fsm_search(rel, search_cat);
+	{
+		BlockNumber blknum = fsm_get_heap_blk(addr, search_slot);
+
+		/*
+		 * Check that the blknum is actually in the relation. Don't try to
+		 * update the FSM in that case, just fall back to the other case
+		 */
+		if (fsm_does_block_exist(rel, blknum))
+			return blknum;
+	}
+	return fsm_search(rel, search_cat);
 }
 
 /*
@@ -297,14 +309,25 @@ FreeSpaceMapPrepareTruncateRel(Relation rel, BlockNumber nblocks)
 		fsm_truncate_avail(BufferGetPage(buf), first_removed_slot);
 
 		/*
-		 * Truncation of a relation is WAL-logged at a higher-level, and we
-		 * will be called at WAL replay. But if checksums are enabled, we need
-		 * to still write a WAL record to protect against a torn page, if the
-		 * page is flushed to disk before the truncation WAL record. We cannot
-		 * use MarkBufferDirtyHint here, because that will not dirty the page
-		 * during recovery.
+		 * This change is non-critical, because fsm_does_block_exist() would
+		 * stop us from returning a truncated-away block. However, since this
+		 * may remove up to SlotsPerFSMPage slots, it's nice to avoid the cost
+		 * of that many fsm_does_block_exist() rejections. Use a full
+		 * MarkBufferDirty(), not MarkBufferDirtyHint().
 		 */
 		MarkBufferDirty(buf);
+
+		/*
+		 * WAL-log like MarkBufferDirtyHint() might have done, just to avoid
+		 * differing from the rest of the file in this respect. This is
+		 * optional; see README mention of full page images. XXX consider
+		 * XLogSaveBufferForHint() for even closer similarity.
+		 *
+		 * A higher-level operation calls us at WAL replay. If we crash
+		 * before the XLOG_SMGR_TRUNCATE flushes to disk, main fork length has
+		 * not changed, and our fork remains valid. If we crash after that
+		 * flush, redo will return here.
+		 */
 		if (!InRecovery && RelationNeedsWAL(rel) && XLogHintBitIsNeeded())
 			log_newpage_buffer(buf, false);
 
@@ -721,8 +744,15 @@ fsm_search(Relation rel, uint8 min_cat)
 									(addr.level == FSM_BOTTOM_LEVEL),
 									false);
 			if (slot == -1)
+			{
 				max_avail = fsm_get_max_avail(BufferGetPage(buf));
-			UnlockReleaseBuffer(buf);
+				UnlockReleaseBuffer(buf);
+			}
+			else
+			{
+				/* Keep the pin for possible update below */
+				LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+			}
 		}
 		else
 			slot = -1;
@@ -734,8 +764,37 @@ fsm_search(Relation rel, uint8 min_cat)
 			 * bottom.
 			 */
 			if (addr.level == FSM_BOTTOM_LEVEL)
-				return fsm_get_heap_blk(addr, slot);
-
+			{
+				BlockNumber blkno = fsm_get_heap_blk(addr, slot);
+				Page		page;
+
+				if (fsm_does_block_exist(rel, blkno))
+				{
+					ReleaseBuffer(buf);
+					return blkno;
+				}
+
+				/*
+				 * Block is past the end of the relation. Update FSM, and
+				 * restart from root. The usual "advancenext" behavior is
+				 * pessimal for this rare scenario, since every later slot is
+				 * unusable in the same way. We could zero all affected slots
+				 * on the same FSM page, but don't bet on the benefits of that
+				 * optimization justifying its compiled code bulk.
+				 */
+				page = BufferGetPage(buf);
+				LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+				fsm_set_avail(page, slot, 0);
+				MarkBufferDirtyHint(buf, false);
+				UnlockReleaseBuffer(buf);
+				if (restarts++ > 10000)	/* same rationale as below */
+					return InvalidBlockNumber;
+				addr = FSM_ROOT_ADDRESS;
+			}
+			else
+			{
+				ReleaseBuffer(buf);
+			}
 			addr = fsm_get_child(addr, slot);
 		}
 		else if (addr.level == FSM_ROOT_LEVEL)
@@ -903,3 +962,26 @@ fsm_vacuum_page(Relation rel, FSMAddress addr,
 
 	return max_avail;
 }
+
+
+/*
+ * Check whether a block number is past the end of the relation. This can
+ * happen after WAL replay, if the FSM reached disk but newly-extended pages
+ * it refers to did not.
+ */
+static bool
+fsm_does_block_exist(Relation rel, BlockNumber blknumber)
+{
+	SMgrRelation smgr = RelationGetSmgr(rel);
+
+	/*
+	 * If below the cached nblocks, the block surely exists. Otherwise, we
+	 * face a trade-off. We opt to compare to a fresh nblocks, incurring
+	 * lseek() overhead. The alternative would be to assume the block does
+	 * not exist, but that would cause FSM to set zero space available for
+	 * blocks that main fork extension just recorded.
+	 */
+	return ((BlockNumberIsValid(smgr->smgr_cached_nblocks[MAIN_FORKNUM]) &&
+			 blknumber < smgr->smgr_cached_nblocks[MAIN_FORKNUM]) ||
+			blknumber < RelationGetNumberOfBlocks(rel));
+}