7 files changed, 201 insertions, 684 deletions
diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
index 6132b732f86..a8c56562f2d 100644
--- a/src/backend/storage/buffer/buf_init.c
+++ b/src/backend/storage/buffer/buf_init.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/storage/buffer/buf_init.c,v 1.49 2002/06/20 20:29:34 momjian Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/storage/buffer/buf_init.c,v 1.50 2002/08/06 02:36:34 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -258,7 +258,7 @@ ShutdownBufferPoolAccess(void)
 	/* Release any buffer context locks we are holding */
 	UnlockBuffers();
 	/* Release any buffer reference counts we are holding */
-	ResetBufferPool(false);
+	AtEOXact_Buffers(false);
 }
 
 /* -----------------------------------------------------
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index b2c19e99f47..1ca7af3b775 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v 1.127 2002/07/02 05:47:37 momjian Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v 1.128 2002/08/06 02:36:34 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -57,16 +57,9 @@
 #include "pgstat.h"
 
 #define BufferGetLSN(bufHdr)	\
-	(*((XLogRecPtr*)MAKE_PTR((bufHdr)->data)))
+	(*((XLogRecPtr*) MAKE_PTR((bufHdr)->data)))
 
 
-extern long int ReadBufferCount;
-extern long int ReadLocalBufferCount;
-extern long int BufferHitCount;
-extern long int LocalBufferHitCount;
-extern long int BufferFlushCount;
-extern long int LocalBufferFlushCount;
-
 static void WaitIO(BufferDesc *buf);
 static void StartBufferIO(BufferDesc *buf, bool forInput);
 static void TerminateBufferIO(BufferDesc *buf);
@@ -82,16 +75,12 @@ static Buffer ReadBufferInternal(Relation reln, BlockNumber blockNum,
 				   bool bufferLockHeld);
 static BufferDesc *BufferAlloc(Relation reln, BlockNumber blockNum,
 			bool *foundPtr);
-static int	ReleaseBufferWithBufferLock(Buffer buffer);
 static int	BufferReplace(BufferDesc *bufHdr);
 #ifdef NOT_USED
 void		PrintBufferDescs(void);
 #endif
 
 static void write_buffer(Buffer buffer, bool unpin);
-static void drop_relfilenode_buffers(RelFileNode rnode,
-                                     bool do_local, bool do_both);
-static int release_buffer(Buffer buffer, bool havelock);
 
 /*
  * ReadBuffer -- returns a buffer containing the requested
@@ -140,7 +129,7 @@ ReadBufferInternal(Relation reln, BlockNumber blockNum,
 	bool		isLocalBuf;
 
 	isExtend = (blockNum == P_NEW);
-	isLocalBuf = reln->rd_myxactonly;
+	isLocalBuf = reln->rd_istemp;
 
 	if (isLocalBuf)
 	{
@@ -684,10 +673,10 @@ ReleaseAndReadBuffer(Buffer buffer,
 /*
  * BufferSync -- Write all dirty buffers in the pool.
  *
- * This is called at checkpoint time and write out all dirty buffers.
+ * This is called at checkpoint time and writes out all dirty shared buffers.
  */
 void
-BufferSync()
+BufferSync(void)
 {
 	int			i;
 	BufferDesc *bufHdr;
@@ -780,8 +769,7 @@ BufferSync()
 			status = smgrblindwrt(DEFAULT_SMGR,
 								  bufHdr->tag.rnode,
 								  bufHdr->tag.blockNum,
-								  (char *) MAKE_PTR(bufHdr->data),
-								  true);		/* must fsync */
+								  (char *) MAKE_PTR(bufHdr->data));
 		}
 		else
 		{
@@ -908,19 +896,16 @@ ResetBufferUsage(void)
 	NDirectFileWrite = 0;
 }
 
-/* ----------------------------------------------
- *		ResetBufferPool
- *
- *		This routine is supposed to be called when a transaction aborts.
- *		It will release all the buffer pins held by the transaction.
- *		Currently, we also call it during commit if BufferPoolCheckLeak
- *		detected a problem --- in that case, isCommit is TRUE, and we
- *		only clean up buffer pin counts.
+/*
+ *		AtEOXact_Buffers - clean up at end of transaction.
  *
- * ----------------------------------------------
+ *		During abort, we need to release any buffer pins we're holding
+ *		(this cleans up in case elog interrupted a routine that pins a
+ *		buffer).  During commit, we shouldn't need to do that, but check
+ *		anyway to see if anyone leaked a buffer reference count.
  */
 void
-ResetBufferPool(bool isCommit)
+AtEOXact_Buffers(bool isCommit)
 {
 	int			i;
 
@@ -928,7 +913,16 @@ ResetBufferPool(bool isCommit)
 	{
 		if (PrivateRefCount[i] != 0)
 		{
-			BufferDesc *buf = &BufferDescriptors[i];
+			BufferDesc *buf = &(BufferDescriptors[i]);
+
+			if (isCommit)
+				elog(WARNING,
+					 "Buffer Leak: [%03d] (freeNext=%d, freePrev=%d, "
+					 "rel=%u/%u, blockNum=%u, flags=0x%x, refcount=%d %ld)",
+					 i, buf->freeNext, buf->freePrev,
+					 buf->tag.rnode.tblNode, buf->tag.rnode.relNode,
+					 buf->tag.blockNum, buf->flags,
+					 buf->refcount, PrivateRefCount[i]);
 
 			PrivateRefCount[i] = 1;		/* make sure we release shared pin */
 			LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
@@ -938,48 +932,15 @@ ResetBufferPool(bool isCommit)
 		}
 	}
 
-	ResetLocalBufferPool();
-
-	if (!isCommit)
-		smgrabort();
+	AtEOXact_LocalBuffers(isCommit);
 }
 
 /*
- * BufferPoolCheckLeak
- *
- *		check if there is buffer leak
- */
-bool
-BufferPoolCheckLeak(void)
-{
-	int			i;
-	bool		result = false;
-
-	for (i = 0; i < NBuffers; i++)
-	{
-		if (PrivateRefCount[i] != 0)
-		{
-			BufferDesc *buf = &(BufferDescriptors[i]);
-
-			elog(WARNING,
-				 "Buffer Leak: [%03d] (freeNext=%d, freePrev=%d, \
-rel=%u/%u, blockNum=%u, flags=0x%x, refcount=%d %ld)",
-				 i, buf->freeNext, buf->freePrev,
-				 buf->tag.rnode.tblNode, buf->tag.rnode.relNode,
-				 buf->tag.blockNum, buf->flags,
-				 buf->refcount, PrivateRefCount[i]);
-			result = true;
-		}
-	}
-	return result;
-}
-
-/* ------------------------------------------------
  * FlushBufferPool
  *
- * Flush all dirty blocks in buffer pool to disk
- * at the checkpoint time
- * ------------------------------------------------
+ * Flush all dirty blocks in buffer pool to disk at the checkpoint time.
+ * Local relations do not participate in checkpoints, so they don't need to be
+ * flushed.
  */
 void
 FlushBufferPool(void)
@@ -989,16 +950,13 @@ FlushBufferPool(void)
 }
 
 /*
- * At the commit time we have to flush local buffer pool only
+ * Do whatever is needed to prepare for commit at the bufmgr and smgr levels
  */
 void
 BufmgrCommit(void)
 {
-	LocalBufferSync();
+	/* Nothing to do in bufmgr anymore... */
 
-	/*
-	 * All files created in current transaction will be fsync-ed
-	 */
 	smgrcommit();
 }
 
@@ -1051,15 +1009,15 @@ BufferReplace(BufferDesc *bufHdr)
 
 	if (reln != (Relation) NULL)
 	{
-		status = smgrwrite(DEFAULT_SMGR, reln, bufHdr->tag.blockNum,
+		status = smgrwrite(DEFAULT_SMGR, reln,
+						   bufHdr->tag.blockNum,
 						   (char *) MAKE_PTR(bufHdr->data));
 	}
 	else
 	{
 		status = smgrblindwrt(DEFAULT_SMGR, bufHdr->tag.rnode,
 							  bufHdr->tag.blockNum,
-							  (char *) MAKE_PTR(bufHdr->data),
-							  false);	/* no fsync */
+							  (char *) MAKE_PTR(bufHdr->data));
 	}
 
 	/* drop relcache refcnt incremented by RelationNodeCacheGetRelation */
@@ -1091,31 +1049,55 @@ RelationGetNumberOfBlocks(Relation relation)
 {
 	/*
 	 * relation->rd_nblocks should be accurate already if the relation is
-	 * myxactonly.	(XXX how safe is that really?)	Don't call smgr on a
-	 * view, either.
+	 * new or temp, because no one else should be modifying it.  Otherwise
+	 * we need to ask the smgr for the current physical file length.
+	 *
+	 * Don't call smgr on a view, either.
 	 */
 	if (relation->rd_rel->relkind == RELKIND_VIEW)
 		relation->rd_nblocks = 0;
-	else if (!relation->rd_myxactonly)
+	else if (!relation->rd_isnew && !relation->rd_istemp)
 		relation->rd_nblocks = smgrnblocks(DEFAULT_SMGR, relation);
 	return relation->rd_nblocks;
 }
 
-/*
- * drop_relfilenode_buffers -- common functionality for
- *                             DropRelationBuffers and
- *                             DropRelFileNodeBuffers
+/* ---------------------------------------------------------------------
+ *		DropRelationBuffers
  *
- *		XXX currently it sequentially searches the buffer pool, should be
- *		changed to more clever ways of searching.
+ *		This function removes all the buffered pages for a relation
+ *		from the buffer pool.  Dirty pages are simply dropped, without
+ *		bothering to write them out first.	This is NOT rollback-able,
+ *		and so should be used only with extreme caution!
+ *
+ *		We assume that the caller holds an exclusive lock on the relation,
+ *		which should assure that no new buffers will be acquired for the rel
+ *		meanwhile.
+ * --------------------------------------------------------------------
  */
-static void
-drop_relfilenode_buffers(RelFileNode rnode, bool do_local, bool do_both)
+void
+DropRelationBuffers(Relation rel)
+{
+	DropRelFileNodeBuffers(rel->rd_node, rel->rd_istemp);
+}
+
+/* ---------------------------------------------------------------------
+ *		DropRelFileNodeBuffers
+ *
+ *		This is the same as DropRelationBuffers, except that the target
+ *		relation is specified by RelFileNode and temp status.
+ *
+ *		This is NOT rollback-able.	One legitimate use is to clear the
+ *		buffer cache of buffers for a relation that is being deleted
+ *		during transaction abort.
+ * --------------------------------------------------------------------
+ */
+void
+DropRelFileNodeBuffers(RelFileNode rnode, bool istemp)
 {
 	int			i;
 	BufferDesc *bufHdr;
 
-	if (do_local)
+	if (istemp)
 	{
 		for (i = 0; i < NLocBuffer; i++)
 		{
@@ -1128,8 +1110,7 @@ drop_relfilenode_buffers(RelFileNode rnode, bool do_local, bool do_both)
 				bufHdr->tag.rnode.relNode = InvalidOid;
 			}
 		}
-		if (!do_both)
-			return;
+		return;
 	}
 
 	LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
@@ -1160,18 +1141,19 @@ recheck:
 			bufHdr->cntxDirty = false;
 
 			/*
-			 * Release any refcount we may have.
-			 *
-			 * This is very probably dead code, and if it isn't then it's
-			 * probably wrong.	I added the Assert to find out --- tgl
-			 * 11/99.
+			 * Release any refcount we may have.  If someone else has a
+			 * pin on the buffer, we've got trouble.
 			 */
 			if (!(bufHdr->flags & BM_FREE))
 			{
-				/* Assert checks that buffer will actually get freed! */
-				Assert(PrivateRefCount[i - 1] == 1 &&
-					   bufHdr->refcount == 1);
-				ReleaseBufferWithBufferLock(i);
+				/* the sole pin should be ours */
+				if (bufHdr->refcount != 1 || PrivateRefCount[i - 1] == 0)
+					elog(FATAL, "DropRelFileNodeBuffers: block %u is referenced (private %ld, global %d)",
+						 bufHdr->tag.blockNum,
+						 PrivateRefCount[i - 1], bufHdr->refcount);
+				/* Make sure it will be released */
+				PrivateRefCount[i - 1] = 1;
+				UnpinBuffer(bufHdr);
 			}
 
 			/*
@@ -1185,43 +1167,6 @@ recheck:
 }
 
 /* ---------------------------------------------------------------------
- *		DropRelationBuffers
- *
- *		This function removes all the buffered pages for a relation
- *		from the buffer pool.  Dirty pages are simply dropped, without
- *		bothering to write them out first.	This is NOT rollback-able,
- *		and so should be used only with extreme caution!
- *
- *		We assume that the caller holds an exclusive lock on the relation,
- *		which should assure that no new buffers will be acquired for the rel
- *		meanwhile.
- * --------------------------------------------------------------------
- */
-void
-DropRelationBuffers(Relation rel)
-{
-	drop_relfilenode_buffers(rel->rd_node, rel->rd_myxactonly, false);
-}
-
-/* ---------------------------------------------------------------------
- *		DropRelFileNodeBuffers
- *
- *		This is the same as DropRelationBuffers, except that the target
- *		relation is specified by RelFileNode.
- *
- *		This is NOT rollback-able.	One legitimate use is to clear the
- *		buffer cache of buffers for a relation that is being deleted
- *		during transaction abort.
- * --------------------------------------------------------------------
- */
-void
-DropRelFileNodeBuffers(RelFileNode rnode)
-{
-	/* We have to search both local and shared buffers... */
-	drop_relfilenode_buffers(rnode, true, true);
-}
-
-/* ---------------------------------------------------------------------
  *		DropBuffers
  *
  *		This function removes all the buffers in the buffer cache for a
@@ -1296,7 +1241,7 @@ recheck:
  */
 #ifdef NOT_USED
 void
-PrintBufferDescs()
+PrintBufferDescs(void)
 {
 	int			i;
 	BufferDesc *buf = BufferDescriptors;
@@ -1331,7 +1276,7 @@ blockNum=%u, flags=0x%x, refcount=%d %ld)",
 
 #ifdef NOT_USED
 void
-PrintPinnedBufs()
+PrintPinnedBufs(void)
 {
 	int			i;
 	BufferDesc *buf = BufferDescriptors;
@@ -1351,33 +1296,6 @@ blockNum=%u, flags=0x%x, refcount=%d %ld)",
 }
 #endif
 
-/*
- * BufferPoolBlowaway
- *
- * this routine is solely for the purpose of experiments -- sometimes
- * you may want to blowaway whatever is left from the past in buffer
- * pool and start measuring some performance with a clean empty buffer
- * pool.
- */
-#ifdef NOT_USED
-void
-BufferPoolBlowaway()
-{
-	int			i;
-
-	BufferSync();
-	for (i = 1; i <= NBuffers; i++)
-	{
-		if (BufferIsValid(i))
-		{
-			while (BufferIsValid(i))
-				ReleaseBuffer(i);
-		}
-		BufTableDelete(&BufferDescriptors[i - 1]);
-	}
-}
-#endif
-
 /* ---------------------------------------------------------------------
  *		FlushRelationBuffers
  *
@@ -1428,7 +1346,7 @@ FlushRelationBuffers(Relation rel, BlockNumber firstDelBlock)
 	XLogRecPtr	recptr;
 	int			status;
 
-	if (rel->rd_myxactonly)
+	if (rel->rd_istemp)
 	{
 		for (i = 0; i < NLocBuffer; i++)
 		{
@@ -1544,12 +1462,14 @@ FlushRelationBuffers(Relation rel, BlockNumber firstDelBlock)
 	return 0;
 }
 
+#undef ReleaseBuffer
+
 /*
- * release_buffer -- common functionality for
- *                   ReleaseBuffer and ReleaseBufferWithBufferLock
+ * ReleaseBuffer -- remove the pin on a buffer without
+ *		marking it dirty.
  */
-static int
-release_buffer(Buffer buffer, bool havelock)
+int
+ReleaseBuffer(Buffer buffer)
 {
 	BufferDesc *bufHdr;
 
@@ -1570,41 +1490,14 @@ release_buffer(Buffer buffer, bool havelock)
 		PrivateRefCount[buffer - 1]--;
 	else
 	{
-		if (!havelock)
-			LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
-
+		LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
 		UnpinBuffer(bufHdr);
-
-		if (!havelock)
-			LWLockRelease(BufMgrLock);
+		LWLockRelease(BufMgrLock);
 	}
 
 	return STATUS_OK;
 }
 
-#undef ReleaseBuffer
-
-/*
- * ReleaseBuffer -- remove the pin on a buffer without
- *		marking it dirty.
- */
-int
-ReleaseBuffer(Buffer buffer)
-{
-	return release_buffer(buffer, false);
-}
-
-/*
- * ReleaseBufferWithBufferLock
- *		Same as ReleaseBuffer except we hold the bufmgr lock
- */
-static int
-ReleaseBufferWithBufferLock(Buffer buffer)
-{
-	return release_buffer(buffer, true);
-}
-
-
 #ifdef NOT_USED
 void
 IncrBufferRefCount_Debug(char *file, int line, Buffer buffer)
@@ -1847,10 +1740,13 @@ SetBufferCommitInfoNeedsSave(Buffer buffer)
 	BufferDesc *bufHdr;
 
 	if (BufferIsLocal(buffer))
+	{
+		WriteLocalBuffer(buffer, false);
 		return;
+	}
 
 	if (BAD_BUFFER_ID(buffer))
-		return;
+		elog(ERROR, "SetBufferCommitInfoNeedsSave: bad buffer %d", buffer);
 
 	bufHdr = &BufferDescriptors[buffer - 1];
 
diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c
index d5edc570b6e..50168c8b306 100644
--- a/src/backend/storage/buffer/localbuf.c
+++ b/src/backend/storage/buffer/localbuf.c
@@ -1,48 +1,37 @@
 /*-------------------------------------------------------------------------
  *
  * localbuf.c
- *	  local buffer manager. Fast buffer manager for temporary tables
- *	  or special cases when the operation is not visible to other backends.
- *
- *	  When a relation is being created, the descriptor will have rd_islocal
- *	  set to indicate that the local buffer manager should be used. During
- *	  the same transaction the relation is being created, any inserts or
- *	  selects from the newly created relation will use the local buffer
- *	  pool. rd_islocal is reset at the end of a transaction (commit/abort).
- *	  This is useful for queries like SELECT INTO TABLE and create index.
+ *	  local buffer manager. Fast buffer manager for temporary tables,
+ *	  which never need to be WAL-logged or checkpointed, etc.
  *
  * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994-5, Regents of the University of California
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/storage/buffer/localbuf.c,v 1.44 2002/06/20 20:29:34 momjian Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/storage/buffer/localbuf.c,v 1.45 2002/08/06 02:36:34 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
 #include "postgres.h"
 
-#include <sys/types.h>
-#include <sys/file.h>
-#include <math.h>
-#include <signal.h>
-
-#include "executor/execdebug.h"
 #include "storage/buf_internals.h"
 #include "storage/bufmgr.h"
 #include "storage/smgr.h"
 #include "utils/relcache.h"
 
-extern long int LocalBufferFlushCount;
 
+/*#define LBDEBUG*/
+
+/* should be a GUC parameter some day */
 int			NLocBuffer = 64;
+
 BufferDesc *LocalBufferDescriptors = NULL;
 Block	   *LocalBufferBlockPointers = NULL;
 long	   *LocalRefCount = NULL;
 
 static int	nextFreeLocalBuf = 0;
 
-/*#define LBDEBUG*/
 
 /*
  * LocalBufferAlloc -
@@ -61,11 +50,11 @@ LocalBufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr)
 			reln->rd_node.relNode &&
 			LocalBufferDescriptors[i].tag.blockNum == blockNum)
 		{
-
 #ifdef LBDEBUG
 			fprintf(stderr, "LB ALLOC (%u,%d) %d\n",
 					RelationGetRelid(reln), blockNum, -i - 1);
 #endif
+
 			LocalRefCount[i]++;
 			*foundPtr = TRUE;
 			return &LocalBufferDescriptors[i];
@@ -94,14 +83,17 @@ LocalBufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr)
 		elog(ERROR, "no empty local buffer.");
 
 	/*
-	 * this buffer is not referenced but it might still be dirty (the last
-	 * transaction to touch it doesn't need its contents but has not
-	 * flushed it).  if that's the case, write it out before reusing it!
+	 * This buffer is not referenced but it might still be dirty.
+	 * If that's the case, write it out before reusing it!
 	 */
 	if (bufHdr->flags & BM_DIRTY || bufHdr->cntxDirty)
 	{
 		Relation	bufrel = RelationNodeCacheGetRelation(bufHdr->tag.rnode);
 
+		/*
+		 * The relcache is not supposed to throw away temp rels, so this
+		 * should always succeed.
+		 */
 		Assert(bufrel != NULL);
 
 		/* flush this page */
@@ -114,25 +106,18 @@ LocalBufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr)
 	}
 
 	/*
-	 * it's all ours now.
-	 *
-	 * We need not in tblNode currently but will in future I think, when
-	 * we'll give up rel->rd_fd to fmgr cache.
-	 */
-	bufHdr->tag.rnode = reln->rd_node;
-	bufHdr->tag.blockNum = blockNum;
-	bufHdr->flags &= ~BM_DIRTY;
-	bufHdr->cntxDirty = false;
-
-	/*
 	 * lazy memory allocation: allocate space on first use of a buffer.
+	 *
+	 * Note this path cannot be taken for a buffer that was previously
+	 * in use, so it's okay to do it (and possibly error out) before
+	 * marking the buffer as valid.
 	 */
 	if (bufHdr->data == (SHMEM_OFFSET) 0)
 	{
 		char	   *data = (char *) malloc(BLCKSZ);
 
 		if (data == NULL)
-			elog(FATAL, "Out of memory in LocalBufferAlloc");
+			elog(ERROR, "Out of memory in LocalBufferAlloc");
 
 		/*
 		 * This is a bit of a hack: bufHdr->data needs to be a shmem
@@ -147,13 +132,24 @@ LocalBufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr)
 		LocalBufferBlockPointers[-(bufHdr->buf_id + 2)] = (Block) data;
 	}
 
+	/*
+	 * it's all ours now.
+	 *
+	 * We do not need tblNode currently, but will in future I think, when
+	 * we give up rel->rd_fd to the fmgr cache.
+	 */
+	bufHdr->tag.rnode = reln->rd_node;
+	bufHdr->tag.blockNum = blockNum;
+	bufHdr->flags &= ~BM_DIRTY;
+	bufHdr->cntxDirty = false;
+
 	*foundPtr = FALSE;
 	return bufHdr;
 }
 
 /*
  * WriteLocalBuffer -
- *	  writes out a local buffer
+ *	  writes out a local buffer (actually, just marks it dirty)
  */
 void
 WriteLocalBuffer(Buffer buffer, bool release)
@@ -180,7 +176,7 @@ WriteLocalBuffer(Buffer buffer, bool release)
  * InitLocalBuffer -
  *	  init the local buffer cache. Since most queries (esp. multi-user ones)
  *	  don't involve local buffers, we delay allocating actual memory for the
- *	  buffer until we need it.
+ *	  buffers until we need them; just make the buffer headers here.
  */
 void
 InitLocalBuffer(void)
@@ -211,65 +207,30 @@ InitLocalBuffer(void)
 }
 
 /*
- * LocalBufferSync
- *
- * Flush all dirty buffers in the local buffer cache at commit time.
- * Since the buffer cache is only used for keeping relations visible
- * during a transaction, we will not need these buffers again.
+ * AtEOXact_LocalBuffers - clean up at end of transaction.
  *
- * Note that we have to *flush* local buffers because of them are not
- * visible to checkpoint makers. But we can skip XLOG flush check.
+ * This is just like AtEOXact_Buffers, but for local buffers.
  */
 void
-LocalBufferSync(void)
+AtEOXact_LocalBuffers(bool isCommit)
 {
 	int			i;
 
 	for (i = 0; i < NLocBuffer; i++)
 	{
-		BufferDesc *buf = &LocalBufferDescriptors[i];
-		Relation	bufrel;
-
-		if (buf->flags & BM_DIRTY || buf->cntxDirty)
+		if (LocalRefCount[i] != 0)
 		{
-#ifdef LBDEBUG
-			fprintf(stderr, "LB SYNC %d\n", -i - 1);
-#endif
-			bufrel = RelationNodeCacheGetRelation(buf->tag.rnode);
-
-			Assert(bufrel != NULL);
+			BufferDesc *buf = &(LocalBufferDescriptors[i]);
 
-			smgrwrite(DEFAULT_SMGR, bufrel, buf->tag.blockNum,
-					  (char *) MAKE_PTR(buf->data));
-			smgrmarkdirty(DEFAULT_SMGR, bufrel, buf->tag.blockNum);
-			LocalBufferFlushCount++;
+			if (isCommit)
+				elog(WARNING,
+					 "Local Buffer Leak: [%03d] (rel=%u/%u, blockNum=%u, flags=0x%x, refcount=%d %ld)",
+					 i,
+					 buf->tag.rnode.tblNode, buf->tag.rnode.relNode,
+					 buf->tag.blockNum, buf->flags,
+					 buf->refcount, LocalRefCount[i]);
 
-			/* drop relcache refcount from RelationNodeCacheGetRelation */
-			RelationDecrementReferenceCount(bufrel);
-
-			buf->flags &= ~BM_DIRTY;
-			buf->cntxDirty = false;
+			LocalRefCount[i] = 0;
 		}
 	}
-
-	MemSet(LocalRefCount, 0, sizeof(long) * NLocBuffer);
-	nextFreeLocalBuf = 0;
-}
-
-void
-ResetLocalBufferPool(void)
-{
-	int			i;
-
-	for (i = 0; i < NLocBuffer; i++)
-	{
-		BufferDesc *buf = &LocalBufferDescriptors[i];
-
-		buf->tag.rnode.relNode = InvalidOid;
-		buf->flags &= ~BM_DIRTY;
-		buf->cntxDirty = false;
-	}
-
-	MemSet(LocalRefCount, 0, sizeof(long) * NLocBuffer);
-	nextFreeLocalBuf = 0;
 }
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index 391a078e602..8be2ed219b9 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/storage/file/fd.c,v 1.92 2002/06/20 20:29:34 momjian Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/storage/file/fd.c,v 1.93 2002/08/06 02:36:34 tgl Exp $
  *
  * NOTES:
  *
@@ -119,8 +119,7 @@ typedef struct vfd
 	unsigned short fdstate;		/* bitflags for VFD's state */
 
 /* these are the assigned bits in fdstate: */
-#define FD_DIRTY		(1 << 0)	/* written to, but not yet fsync'd */
-#define FD_TEMPORARY	(1 << 1)	/* should be unlinked when closed */
+#define FD_TEMPORARY	(1 << 0)	/* should be unlinked when closed */
 
 	File		nextFree;		/* link to next free VFD, if in freelist */
 	File		lruMoreRecently;	/* doubly linked recency-of-use list */
@@ -396,15 +395,6 @@ LruDelete(File file)
 	vfdP->seekPos = (long) lseek(vfdP->fd, 0L, SEEK_CUR);
 	Assert(vfdP->seekPos != -1L);
 
-	/* if we have written to the file, sync it before closing */
-	if (vfdP->fdstate & FD_DIRTY)
-	{
-		if (pg_fsync(vfdP->fd))
-			elog(LOG, "LruDelete: failed to fsync %s: %m",
-				 vfdP->fileName);
-		vfdP->fdstate &= ~FD_DIRTY;
-	}
-
 	/* close the file */
 	if (close(vfdP->fd))
 		elog(LOG, "LruDelete: failed to close %s: %m",
@@ -725,17 +715,8 @@ fileNameOpenFile(FileName fileName,
 	/* Saved flags are adjusted to be OK for re-opening file */
 	vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
 	vfdP->fileMode = fileMode;
-
 	vfdP->seekPos = 0;
-
-	/*
-	 * Have to fsync file on commit. Alternative way - log file creation
-	 * and fsync log before actual file creation.
-	 */
-	if (fileFlags & O_CREAT)
-		vfdP->fdstate = FD_DIRTY;
-	else
-		vfdP->fdstate = 0x0;
+	vfdP->fdstate = 0x0;
 
 	return file;
 }
@@ -841,15 +822,6 @@ FileClose(File file)
 		/* remove the file from the lru ring */
 		Delete(file);
 
-		/* if we did any writes, sync the file before closing */
-		if (vfdP->fdstate & FD_DIRTY)
-		{
-			if (pg_fsync(vfdP->fd))
-				elog(LOG, "FileClose: failed to fsync %s: %m",
-					 vfdP->fileName);
-			vfdP->fdstate &= ~FD_DIRTY;
-		}
-
 		/* close the file */
 		if (close(vfdP->fd))
 			elog(LOG, "FileClose: failed to close %s: %m",
@@ -1022,108 +994,11 @@ FileTruncate(File file, long offset)
 	DO_DB(elog(LOG, "FileTruncate %d (%s)",
 			   file, VfdCache[file].fileName));
 
-	FileSync(file);
 	FileAccess(file);
 	returnCode = ftruncate(VfdCache[file].fd, (size_t) offset);
 	return returnCode;
 }
 
-/*
- * FileSync --- if a file is marked as dirty, fsync it.
- *
- * The FD_DIRTY bit is slightly misnamed: it doesn't mean that we need to
- * write the file, but that we *have* written it and need to execute an
- * fsync() to ensure the changes are down on disk before we mark the current
- * transaction committed.
- *
- * FD_DIRTY is set by FileWrite or by an explicit FileMarkDirty() call.
- * It is cleared after successfully fsync'ing the file.  FileClose() will
- * fsync a dirty File that is about to be closed, since there will be no
- * other place to remember the need to fsync after the VFD is gone.
- *
- * Note that the DIRTY bit is logically associated with the actual disk file,
- * not with any particular kernel FD we might have open for it.  We assume
- * that fsync will force out any dirty buffers for that file, whether or not
- * they were written through the FD being used for the fsync call --- they
- * might even have been written by some other backend!
- *
- * Note also that LruDelete currently fsyncs a dirty file that it is about
- * to close the kernel file descriptor for.  The idea there is to avoid
- * having to re-open the kernel descriptor later.  But it's not real clear
- * that this is a performance win; we could end up fsyncing the same file
- * multiple times in a transaction, which would probably cost more time
- * than is saved by avoiding an open() call.  This should be studied.
- *
- * This routine used to think it could skip the fsync if the file is
- * physically closed, but that is now WRONG; see comments for FileMarkDirty.
- */
-int
-FileSync(File file)
-{
-	int			returnCode;
-
-	Assert(FileIsValid(file));
-
-	if (!(VfdCache[file].fdstate & FD_DIRTY))
-	{
-		/* Need not sync if file is not dirty. */
-		returnCode = 0;
-	}
-	else if (!enableFsync)
-	{
-		/* Don't force the file open if pg_fsync isn't gonna sync it. */
-		returnCode = 0;
-		VfdCache[file].fdstate &= ~FD_DIRTY;
-	}
-	else
-	{
-		/*
-		 * We don't use FileAccess() because we don't want to force the
-		 * file to the front of the LRU ring; we aren't expecting to
-		 * access it again soon.
-		 */
-		if (FileIsNotOpen(file))
-		{
-			returnCode = LruInsert(file);
-			if (returnCode != 0)
-				return returnCode;
-		}
-		returnCode = pg_fsync(VfdCache[file].fd);
-		if (returnCode == 0)
-			VfdCache[file].fdstate &= ~FD_DIRTY;
-	}
-
-	return returnCode;
-}
-
-/*
- * FileMarkDirty --- mark a file as needing fsync at transaction commit.
- *
- * Since FileWrite marks the file dirty, this routine is not needed in
- * normal use.	It is called when the buffer manager detects that some other
- * backend has written out a shared buffer that this backend dirtied (but
- * didn't write) in the current xact.  In that scenario, we need to fsync
- * the file before we can commit.  We cannot assume that the other backend
- * has fsync'd the file yet; we need to do our own fsync to ensure that
- * (a) the disk page is written and (b) this backend's commit is delayed
- * until the write is complete.
- *
- * Note we are assuming that an fsync issued by this backend will write
- * kernel disk buffers that were dirtied by another backend.  Furthermore,
- * it doesn't matter whether we currently have the file physically open;
- * we must fsync even if we have to re-open the file to do it.
- */
-void
-FileMarkDirty(File file)
-{
-	Assert(FileIsValid(file));
-
-	DO_DB(elog(LOG, "FileMarkDirty: %d (%s)",
-			   file, VfdCache[file].fileName));
-
-	VfdCache[file].fdstate |= FD_DIRTY;
-}
-
 
 /*
  * Routines that want to use stdio (ie, FILE*) should use AllocateFile
@@ -1142,7 +1017,6 @@ FileMarkDirty(File file)
  *
  * Ideally this should be the *only* direct call of fopen() in the backend.
  */
-
 FILE *
 AllocateFile(char *name, char *mode)
 {
@@ -1229,12 +1103,6 @@ closeAllVfds(void)
  * exit (it doesn't particularly care which).  All still-open temporary-file
  * VFDs are closed, which also causes the underlying files to be deleted.
  * Furthermore, all "allocated" stdio files are closed.
- *
- * This routine is not involved in fsync'ing non-temporary files at xact
- * commit; that is done by FileSync under control of the buffer manager.
- * During a commit, that is done *before* control gets here.  If we still
- * have any needs-fsync bits set when we get here, we assume this is abort
- * and clear them.
  */
 void
 AtEOXact_Files(void)
@@ -1249,8 +1117,6 @@ AtEOXact_Files(void)
 			if ((VfdCache[i].fdstate & FD_TEMPORARY) &&
 				VfdCache[i].fileName != NULL)
 				FileClose(i);
-			else
-				VfdCache[i].fdstate &= ~FD_DIRTY;
 		}
 	}
 
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 978d85d4868..25051a9799c 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/storage/smgr/md.c,v 1.91 2002/06/20 20:29:35 momjian Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/storage/smgr/md.c,v 1.92 2002/08/06 02:36:34 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -381,16 +381,7 @@ mdclose_fd(int fd)
 
 		/* if not closed already */
 		if (v->mdfd_vfd >= 0)
-		{
-			/*
-			 * We sync the file descriptor so that we don't need to reopen
-			 * it at transaction commit to force changes to disk.  (This
-			 * is not really optional, because we are about to forget that
-			 * the file even exists...)
-			 */
-			FileSync(v->mdfd_vfd);
 			FileClose(v->mdfd_vfd);
-		}
 		/* Now free vector */
 		v = v->mdfd_chain;
 		if (ov != &Md_fdvec[fd])
@@ -403,16 +394,7 @@ mdclose_fd(int fd)
 	if (v != (MdfdVec *) NULL)
 	{
 		if (v->mdfd_vfd >= 0)
-		{
-			/*
-			 * We sync the file descriptor so that we don't need to reopen
-			 * it at transaction commit to force changes to disk.  (This
-			 * is not really optional, because we are about to forget that
-			 * the file even exists...)
-			 */
-			FileSync(v->mdfd_vfd);
 			FileClose(v->mdfd_vfd);
-		}
 	}
 #endif
 
@@ -498,55 +480,15 @@ mdwrite(Relation reln, BlockNumber blocknum, char *buffer)
 }
 
 /*
- *	mdflush() -- Synchronously write a block to disk.
- *
- *		This is exactly like mdwrite(), but doesn't return until the file
- *		system buffer cache has been flushed.
- */
-int
-mdflush(Relation reln, BlockNumber blocknum, char *buffer)
-{
-	int			status;
-	long		seekpos;
-	MdfdVec    *v;
-
-	v = _mdfd_getseg(reln, blocknum);
-
-#ifndef LET_OS_MANAGE_FILESIZE
-	seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
-#ifdef DIAGNOSTIC
-	if (seekpos >= BLCKSZ * RELSEG_SIZE)
-		elog(FATAL, "seekpos too big!");
-#endif
-#else
-	seekpos = (long) (BLCKSZ * (blocknum));
-#endif
-
-	if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
-		return SM_FAIL;
-
-	/* write and sync the block */
-	status = SM_SUCCESS;
-	if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ
-		|| FileSync(v->mdfd_vfd) < 0)
-		status = SM_FAIL;
-
-	return status;
-}
-
-/*
  *	mdblindwrt() -- Write a block to disk blind.
  *
- *		We have to be able to do this using only the name and OID of
- *		the database and relation in which the block belongs.  Otherwise
- *		this is much like mdwrite().  If dofsync is TRUE, then we fsync
- *		the file, making it more like mdflush().
+ *		We have to be able to do this using only the rnode of the relation
+ *		in which the block belongs.  Otherwise this is much like mdwrite().
  */
 int
 mdblindwrt(RelFileNode rnode,
 		   BlockNumber blkno,
-		   char *buffer,
-		   bool dofsync)
+		   char *buffer)
 {
 	int			status;
 	long		seekpos;
@@ -568,7 +510,6 @@ mdblindwrt(RelFileNode rnode,
 #endif
 
 	errno = 0;
-
 	if (lseek(fd, seekpos, SEEK_SET) != seekpos)
 	{
 		elog(LOG, "mdblindwrt: lseek(%ld) failed: %m", seekpos);
@@ -578,7 +519,7 @@ mdblindwrt(RelFileNode rnode,
 
 	status = SM_SUCCESS;
 
-	/* write and optionally sync the block */
+	/* write the block */
 	errno = 0;
 	if (write(fd, buffer, BLCKSZ) != BLCKSZ)
 	{
@@ -599,54 +540,6 @@ mdblindwrt(RelFileNode rnode,
 }
 
 /*
- *	mdmarkdirty() -- Mark the specified block "dirty" (ie, needs fsync).
- *
- *		Returns SM_SUCCESS or SM_FAIL.
- */
-int
-mdmarkdirty(Relation reln, BlockNumber blkno)
-{
-	MdfdVec    *v;
-
-	v = _mdfd_getseg(reln, blkno);
-
-	FileMarkDirty(v->mdfd_vfd);
-
-	return SM_SUCCESS;
-}
-
-/*
- *	mdblindmarkdirty() -- Mark the specified block "dirty" (ie, needs fsync).
- *
- *		We have to be able to do this using only the name and OID of
- *		the database and relation in which the block belongs.  Otherwise
- *		this is much like mdmarkdirty().  However, we do the fsync immediately
- *		rather than building md/fd datastructures to postpone it till later.
- */
-int
-mdblindmarkdirty(RelFileNode rnode,
-				 BlockNumber blkno)
-{
-	int			status;
-	int			fd;
-
-	fd = _mdfd_blind_getseg(rnode, blkno);
-
-	if (fd < 0)
-		return SM_FAIL;
-
-	status = SM_SUCCESS;
-
-	if (pg_fsync(fd) < 0)
-		status = SM_FAIL;
-
-	if (close(fd) < 0)
-		status = SM_FAIL;
-
-	return status;
-}
-
-/*
  *	mdnblocks() -- Get the number of blocks stored in a relation.
  *
  *		Important side effect: all segments of the relation are opened
@@ -796,61 +689,36 @@ mdtruncate(Relation reln, BlockNumber nblocks)
 /*
  *	mdcommit() -- Commit a transaction.
  *
- *		All changes to magnetic disk relations must be forced to stable
- *		storage.  This routine makes a pass over the private table of
- *		file descriptors.  Any descriptors to which we have done writes,
- *		but not synced, are synced here.
- *
  *		Returns SM_SUCCESS or SM_FAIL with errno set as appropriate.
  */
 int
-mdcommit()
+mdcommit(void)
 {
-	int			i;
-	MdfdVec    *v;
-
-	for (i = 0; i < CurFd; i++)
-	{
-		v = &Md_fdvec[i];
-		if (v->mdfd_flags & MDFD_FREE)
-			continue;
-		/* Sync the file entry */
-#ifndef LET_OS_MANAGE_FILESIZE
-		for (; v != (MdfdVec *) NULL; v = v->mdfd_chain)
-#else
-		if (v != (MdfdVec *) NULL)
-#endif
-		{
-			if (FileSync(v->mdfd_vfd) < 0)
-				return SM_FAIL;
-		}
-	}
-
+	/*
+	 * We don't actually have to do anything here...
+	 */
 	return SM_SUCCESS;
 }
 
 /*
  *	mdabort() -- Abort a transaction.
  *
- *		Changes need not be forced to disk at transaction abort.  We mark
- *		all file descriptors as clean here.  Always returns SM_SUCCESS.
+ *		Changes need not be forced to disk at transaction abort.
  */
 int
-mdabort()
+mdabort(void)
 {
 	/*
-	 * We don't actually have to do anything here.  fd.c will discard
-	 * fsync-needed bits in its AtEOXact_Files() routine.
+	 * We don't actually have to do anything here...
 	 */
 	return SM_SUCCESS;
 }
 
 /*
- *	mdsync() -- Sync storage.
- *
+ *	mdsync() -- Sync previous writes to stable storage.
  */
 int
-mdsync()
+mdsync(void)
 {
 	sync();
 	if (IsUnderPostmaster)
@@ -861,11 +729,9 @@ mdsync()
 
 /*
  *	_fdvec_alloc () -- grab a free (or new) md file descriptor vector.
- *
  */
-static
-int
-_fdvec_alloc()
+static int
+_fdvec_alloc(void)
 {
 	MdfdVec    *nvec;
 	int			fdvec,
diff --git a/src/backend/storage/smgr/mm.c b/src/backend/storage/smgr/mm.c
index 89396d173c9..739e938fe28 100644
--- a/src/backend/storage/smgr/mm.c
+++ b/src/backend/storage/smgr/mm.c
@@ -11,7 +11,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/storage/smgr/Attic/mm.c,v 1.31 2002/06/20 20:29:36 momjian Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/storage/smgr/Attic/mm.c,v 1.32 2002/08/06 02:36:34 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -81,7 +81,7 @@ static HTAB *MMCacheHT;
 static HTAB *MMRelCacheHT;
 
 int
-mminit()
+mminit(void)
 {
 	char	   *mmcacheblk;
 	int			mmsize = 0;
@@ -151,7 +151,7 @@ mminit()
 }
 
 int
-mmshutdown()
+mmshutdown(void)
 {
 	return SM_SUCCESS;
 }
@@ -443,30 +443,15 @@ mmwrite(Relation reln, BlockNumber blocknum, char *buffer)
 }
 
 /*
- *	mmflush() -- Synchronously write a block to stable storage.
- *
- *		For main-memory relations, this is exactly equivalent to mmwrite().
- */
-int
-mmflush(Relation reln, BlockNumber blocknum, char *buffer)
-{
-	return mmwrite(reln, blocknum, buffer);
-}
-
-/*
  *	mmblindwrt() -- Write a block to stable storage blind.
  *
- *		We have to be able to do this using only the name and OID of
- *		the database and relation in which the block belongs.
+ *		We have to be able to do this using only the rnode of the relation
+ *		in which the block belongs.  Unimplemented: always returns SM_FAIL.
  */
 int
-mmblindwrt(char *dbstr,
-		   char *relstr,
-		   Oid dbid,
-		   Oid relid,
+mmblindwrt(RelFileNode rnode,
 		   BlockNumber blkno,
-		   char *buffer,
-		   bool dofsync)
+		   char *buffer)
 {
 	return SM_FAIL;
 }
@@ -512,7 +497,7 @@ mmnblocks(Relation reln)
  *		Returns SM_SUCCESS or SM_FAIL with errno set as appropriate.
  */
 int
-mmcommit()
+mmcommit(void)
 {
 	return SM_SUCCESS;
 }
@@ -522,7 +507,7 @@ mmcommit()
  */
 
 int
-mmabort()
+mmabort(void)
 {
 	return SM_SUCCESS;
 }
@@ -536,7 +521,7 @@ mmabort()
  *		manager will use.
  */
 int
-MMShmemSize()
+MMShmemSize(void)
 {
 	int			size = 0;
 
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index a7fb23b4427..252781d9c3f 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -11,7 +11,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/storage/smgr/smgr.c,v 1.57 2002/06/20 20:29:36 momjian Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/storage/smgr/smgr.c,v 1.58 2002/08/06 02:36:34 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -40,12 +40,8 @@ typedef struct f_smgr
 										  char *buffer);
 	int			(*smgr_write) (Relation reln, BlockNumber blocknum,
 										   char *buffer);
-	int			(*smgr_flush) (Relation reln, BlockNumber blocknum,
-										   char *buffer);
 	int			(*smgr_blindwrt) (RelFileNode rnode, BlockNumber blkno,
-											  char *buffer, bool dofsync);
-	int			(*smgr_markdirty) (Relation reln, BlockNumber blkno);
-	int			(*smgr_blindmarkdirty) (RelFileNode, BlockNumber blkno);
+											  char *buffer);
 	BlockNumber (*smgr_nblocks) (Relation reln);
 	BlockNumber (*smgr_truncate) (Relation reln, BlockNumber nblocks);
 	int			(*smgr_commit) (void);	/* may be NULL */
@@ -62,15 +58,15 @@ static f_smgr smgrsw[] = {
 
 	/* magnetic disk */
 	{mdinit, NULL, mdcreate, mdunlink, mdextend, mdopen, mdclose,
-		mdread, mdwrite, mdflush, mdblindwrt, mdmarkdirty, mdblindmarkdirty,
+		mdread, mdwrite, mdblindwrt,
 		mdnblocks, mdtruncate, mdcommit, mdabort, mdsync
 	},
 
 #ifdef STABLE_MEMORY_STORAGE
 	/* main memory */
 	{mminit, mmshutdown, mmcreate, mmunlink, mmextend, mmopen, mmclose,
-		mmread, mmwrite, mmflush, mmblindwrt, mmmarkdirty, mmblindmarkdirty,
-	mmnblocks, NULL, mmcommit, mmabort},
+		mmread, mmwrite, mmblindwrt,
+		mmnblocks, NULL, mmcommit, mmabort, NULL},
 #endif
 };
 
@@ -110,6 +106,7 @@ typedef struct PendingRelDelete
 {
 	RelFileNode relnode;		/* relation that may need to be deleted */
 	int16		which;			/* which storage manager? */
+	bool		isTemp;			/* is it a temporary relation? */
 	bool		atCommit;		/* T=delete at commit; F=delete at abort */
 	struct PendingRelDelete *next;		/* linked-list link */
 } PendingRelDelete;
@@ -123,7 +120,7 @@ static PendingRelDelete *pendingDeletes = NULL; /* head of linked list */
  *
  */
 int
-smgrinit()
+smgrinit(void)
 {
 	int			i;
 
@@ -181,6 +178,7 @@ smgrcreate(int16 which, Relation reln)
 		MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));
 	pending->relnode = reln->rd_node;
 	pending->which = which;
+	pending->isTemp = reln->rd_istemp;
 	pending->atCommit = false;	/* delete if abort */
 	pending->next = pendingDeletes;
 	pendingDeletes = pending;
@@ -208,6 +206,7 @@ smgrunlink(int16 which, Relation reln)
 		MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));
 	pending->relnode = reln->rd_node;
 	pending->which = which;
+	pending->isTemp = reln->rd_istemp;
 	pending->atCommit = true;	/* delete if commit */
 	pending->next = pendingDeletes;
 	pendingDeletes = pending;
@@ -312,8 +311,10 @@ smgrread(int16 which, Relation reln, BlockNumber blocknum, char *buffer)
 /*
  *	smgrwrite() -- Write the supplied buffer out.
  *
- *		This is not a synchronous write -- the interface for that is
- *		smgrflush().  The buffer is written out via the appropriate
+ *		This is not a synchronous write -- the block is not necessarily
+ *		on disk at return, only dumped out to the kernel.
+ *
+ *		The buffer is written out via the appropriate
  *		storage manager.  This routine returns SM_SUCCESS or aborts
  *		the current transaction.
  */
@@ -332,23 +333,6 @@ smgrwrite(int16 which, Relation reln, BlockNumber blocknum, char *buffer)
 }
 
 /*
- *	smgrflush() -- A synchronous smgrwrite().
- */
-int
-smgrflush(int16 which, Relation reln, BlockNumber blocknum, char *buffer)
-{
-	int			status;
-
-	status = (*(smgrsw[which].smgr_flush)) (reln, blocknum, buffer);
-
-	if (status == SM_FAIL)
-		elog(ERROR, "cannot flush block %d of %s to stable store: %m",
-			 blocknum, RelationGetRelationName(reln));
-
-	return status;
-}
-
-/*
  *	smgrblindwrt() -- Write a page out blind.
  *
  *		In some cases, we may find a page in the buffer cache that we
@@ -357,20 +341,18 @@ smgrflush(int16 which, Relation reln, BlockNumber blocknum, char *buffer)
  *		that has not yet committed, which created a new relation.  In
  *		this case, the buffer manager will call smgrblindwrt() with
  *		the name and OID of the database and the relation to which the
- *		buffer belongs.  Every storage manager must be able to force
- *		this page down to stable storage in this circumstance.	The
- *		write should be synchronous if dofsync is true.
+ *		buffer belongs.  Every storage manager must be able to write
+ *		this page out to stable storage in this circumstance.
  */
 int
 smgrblindwrt(int16 which,
 			 RelFileNode rnode,
 			 BlockNumber blkno,
-			 char *buffer,
-			 bool dofsync)
+			 char *buffer)
 {
 	int			status;
 
-	status = (*(smgrsw[which].smgr_blindwrt)) (rnode, blkno, buffer, dofsync);
+	status = (*(smgrsw[which].smgr_blindwrt)) (rnode, blkno, buffer);
 
 	if (status == SM_FAIL)
 		elog(ERROR, "cannot write block %d of %u/%u blind: %m",
@@ -380,53 +362,6 @@ smgrblindwrt(int16 which,
 }
 
 /*
- *	smgrmarkdirty() -- Mark a page dirty (needs fsync).
- *
- *		Mark the specified page as needing to be fsync'd before commit.
- *		Ordinarily, the storage manager will do this implicitly during
- *		smgrwrite().  However, the buffer manager may discover that some
- *		other backend has written a buffer that we dirtied in the current
- *		transaction.  In that case, we still need to fsync the file to be
- *		sure the page is down to disk before we commit.
- */
-int
-smgrmarkdirty(int16 which,
-			  Relation reln,
-			  BlockNumber blkno)
-{
-	int			status;
-
-	status = (*(smgrsw[which].smgr_markdirty)) (reln, blkno);
-
-	if (status == SM_FAIL)
-		elog(ERROR, "cannot mark block %d of %s: %m",
-			 blkno, RelationGetRelationName(reln));
-
-	return status;
-}
-
-/*
- *	smgrblindmarkdirty() -- Mark a page dirty, "blind".
- *
- *		Just like smgrmarkdirty, except we don't have a reldesc.
- */
-int
-smgrblindmarkdirty(int16 which,
-				   RelFileNode rnode,
-				   BlockNumber blkno)
-{
-	int			status;
-
-	status = (*(smgrsw[which].smgr_blindmarkdirty)) (rnode, blkno);
-
-	if (status == SM_FAIL)
-		elog(ERROR, "cannot mark block %d of %u/%u blind: %m",
-			 blkno, rnode.tblNode, rnode.relNode);
-
-	return status;
-}
-
-/*
  *	smgrnblocks() -- Calculate the number of POSTGRES blocks in the
  *					 supplied relation.
  *
@@ -504,7 +439,7 @@ smgrDoPendingDeletes(bool isCommit)
 			 * any in the commit case, but there can be in the abort
 			 * case).
 			 */
-			DropRelFileNodeBuffers(pending->relnode);
+			DropRelFileNodeBuffers(pending->relnode, pending->isTemp);
 
 			/*
 			 * Tell the free space map to forget this relation.  It won't
@@ -531,11 +466,13 @@ smgrDoPendingDeletes(bool isCommit)
 }
 
 /*
- *	smgrcommit(), smgrabort() -- Commit or abort changes made during the
- *								 current transaction.
+ *	smgrcommit() -- Prepare to commit changes made during the current
+ *					transaction.
+ *
+ * This is called before we actually commit.
  */
 int
-smgrcommit()
+smgrcommit(void)
 {
 	int			i;
 
@@ -553,8 +490,11 @@ smgrcommit()
 	return SM_SUCCESS;
 }
 
+/*
+ *	smgrabort() -- Abort changes made during the current transaction.
+ */
 int
-smgrabort()
+smgrabort(void)
 {
 	int			i;
 
@@ -572,8 +512,11 @@ smgrabort()
 	return SM_SUCCESS;
 }
 
+/*
+ *	smgrsync() -- Sync files to disk at checkpoint time.
+ */
 int
-smgrsync()
+smgrsync(void)
 {
 	int			i;