1 files changed, 106 insertions, 46 deletions
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index e3cc00ca1a9..e8396b698d4 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.126 2007/01/17 00:17:21 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.127 2007/01/17 16:25:01 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -31,6 +31,10 @@
 /* interval for calling AbsorbFsyncRequests in mdsync */
 #define FSYNCS_PER_ABSORB		10
 
+/* special values for the segno arg to RememberFsyncRequest */
+#define FORGET_RELATION_FSYNC	(InvalidBlockNumber)
+#define FORGET_DATABASE_FSYNC	(InvalidBlockNumber-1)
+
 /*
  * On Windows, we have to interpret EACCES as possibly meaning the same as
  * ENOENT, because if a file is unlinked-but-not-yet-gone on that platform,
@@ -258,30 +262,7 @@ mdunlink(RelFileNode rnode, bool isRedo)
 	 * We have to clean out any pending fsync requests for the doomed relation,
 	 * else the next mdsync() will fail.
 	 */
-	if (pendingOpsTable)
-	{
-		/* standalone backend or startup process: fsync state is local */
-		RememberFsyncRequest(rnode, InvalidBlockNumber);
-	}
-	else if (IsUnderPostmaster)
-	{
-		/*
-		 * Notify the bgwriter about it.  If we fail to queue the revoke
-		 * message, we have to sleep and try again ... ugly, but hopefully
-		 * won't happen often.
-		 *
-		 * XXX should we CHECK_FOR_INTERRUPTS in this loop?  Escaping with
-		 * an error would leave the no-longer-used file still present on
-		 * disk, which would be bad, so I'm inclined to assume that the
-		 * bgwriter will always empty the queue soon.
-		 */
-		while (!ForwardFsyncRequest(rnode, InvalidBlockNumber))
-			pg_usleep(10000L);	/* 10 msec seems a good number */
-		/*
-		 * Note we don't wait for the bgwriter to actually absorb the
-		 * revoke message; see mdsync() for the implications.
-		 */
-	}
+	ForgetRelationFsyncRequests(rnode);
 
 	path = relpath(rnode);
 
@@ -894,7 +875,8 @@ mdsync(void)
 	 * what we will do is retry the whole process after absorbing fsync
 	 * request messages again.  Since mdunlink() queues a "revoke" message
 	 * before actually unlinking, the fsync request is guaranteed to be gone
-	 * the second time if it really was this case.
+	 * the second time if it really was this case.  DROP DATABASE likewise
+	 * has to tell us to forget fsync requests before it starts deletions.
 	 */
 	do {
 		HASH_SEQ_STATUS hstat;
@@ -1043,17 +1025,58 @@ register_dirty_segment(SMgrRelation reln, MdfdVec *seg)
  * We stuff the fsync request into the local hash table for execution
  * during the bgwriter's next checkpoint.
  *
- * segno == InvalidBlockNumber is a "revoke" request: remove any pending
- * fsync requests for the whole relation.
+ * The range of possible segment numbers is way less than the range of
+ * BlockNumber, so we can reserve high values of segno for special purposes.
+ * We define two: FORGET_RELATION_FSYNC means to drop pending fsyncs for
+ * a relation, and FORGET_DATABASE_FSYNC means to drop pending fsyncs for
+ * a whole database.  (These are a tad slow because the hash table has to be
+ * searched linearly, but it doesn't seem worth rethinking the table structure
+ * for them.)
  */
 void
 RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
 {
 	Assert(pendingOpsTable);
 
-	if (segno != InvalidBlockNumber)
+	if (segno == FORGET_RELATION_FSYNC)
+	{
+		/* Remove any pending requests for the entire relation */
+		HASH_SEQ_STATUS hstat;
+		PendingOperationEntry *entry;
+
+		hash_seq_init(&hstat, pendingOpsTable);
+		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
+		{
+			if (RelFileNodeEquals(entry->tag.rnode, rnode))
+			{
+				/* Okay, delete this entry */
+				if (hash_search(pendingOpsTable, &entry->tag,
+								HASH_REMOVE, NULL) == NULL)
+					elog(ERROR, "pendingOpsTable corrupted");
+			}
+		}
+	}
+	else if (segno == FORGET_DATABASE_FSYNC)
+	{
+		/* Remove any pending requests for the entire database */
+		HASH_SEQ_STATUS hstat;
+		PendingOperationEntry *entry;
+
+		hash_seq_init(&hstat, pendingOpsTable);
+		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
+		{
+			if (entry->tag.rnode.dbNode == rnode.dbNode)
+			{
+				/* Okay, delete this entry */
+				if (hash_search(pendingOpsTable, &entry->tag,
+								HASH_REMOVE, NULL) == NULL)
+					elog(ERROR, "pendingOpsTable corrupted");
+			}
+		}
+	}
+	else
 	{
-		/* Enter a request to fsync this segment */
+		/* Normal case: enter a request to fsync this segment */
 		PendingOperationTag key;
 		PendingOperationEntry *entry;
 		bool		found;
@@ -1070,29 +1093,66 @@ RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
 		if (!found)				/* new entry, so initialize it */
 			entry->failures = 0;
 	}
-	else
+}
+
+/*
+ * ForgetRelationFsyncRequests -- ensure any fsyncs for a rel are forgotten
+ */
+void
+ForgetRelationFsyncRequests(RelFileNode rnode)
+{
+	if (pendingOpsTable)
+	{
+		/* standalone backend or startup process: fsync state is local */
+		RememberFsyncRequest(rnode, FORGET_RELATION_FSYNC);
+	}
+	else if (IsUnderPostmaster)
 	{
 		/*
-		 * Remove any pending requests for the entire relation.  (This is a
-		 * tad slow but it doesn't seem worth rethinking the table structure.)
+		 * Notify the bgwriter about it.  If we fail to queue the revoke
+		 * message, we have to sleep and try again ... ugly, but hopefully
+		 * won't happen often.
+		 *
+		 * XXX should we CHECK_FOR_INTERRUPTS in this loop?  Escaping with
+		 * an error would leave the no-longer-used file still present on
+		 * disk, which would be bad, so I'm inclined to assume that the
+		 * bgwriter will always empty the queue soon.
 		 */
-		HASH_SEQ_STATUS hstat;
-		PendingOperationEntry *entry;
+		while (!ForwardFsyncRequest(rnode, FORGET_RELATION_FSYNC))
+			pg_usleep(10000L);	/* 10 msec seems a good number */
+		/*
+		 * Note we don't wait for the bgwriter to actually absorb the
+		 * revoke message; see mdsync() for the implications.
+		 */
+	}
+}
 
-		hash_seq_init(&hstat, pendingOpsTable);
-		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
-		{
-			if (RelFileNodeEquals(entry->tag.rnode, rnode))
-			{
-				/* Okay, delete this entry */
-				if (hash_search(pendingOpsTable, &entry->tag,
-								HASH_REMOVE, NULL) == NULL)
-					elog(ERROR, "pendingOpsTable corrupted");
-			}
-		}
+/*
+ * ForgetDatabaseFsyncRequests -- ensure any fsyncs for a DB are forgotten
+ */
+void
+ForgetDatabaseFsyncRequests(Oid dbid)
+{
+	RelFileNode rnode;
+
+	rnode.dbNode = dbid;
+	rnode.spcNode = 0;
+	rnode.relNode = 0;
+
+	if (pendingOpsTable)
+	{
+		/* standalone backend or startup process: fsync state is local */
+		RememberFsyncRequest(rnode, FORGET_DATABASE_FSYNC);
+	}
+	else if (IsUnderPostmaster)
+	{
+		/* see notes in ForgetRelationFsyncRequests */
+		while (!ForwardFsyncRequest(rnode, FORGET_DATABASE_FSYNC))
+			pg_usleep(10000L);	/* 10 msec seems a good number */
 	}
 }
 
+
 /*
  *	_fdvec_alloc() -- Make a MdfdVec object.
  */