Per previous discussions, get rid of use of sync(2) in favor of explicitly fsync'ing every (non-temp) file we have written since the last checkpoint. In the vast majority of cases, the burden of the fsyncs should fall on the bgwriter process, not on backends. (To this end, we assume that an fsync issued by the bgwriter will force out blocks written to the same file by other processes using other file descriptors. Anyone have a problem with that?) This makes the world safe for WIN32, which ain't even got sync(2), and really makes the world safe for Unixen as well, because sync(2) never had the semantics we need: it offers no way to wait for the requested I/O to finish.

Along the way, fix a bug I recently introduced in xlog recovery: file truncation replay failed to clear bufmgr buffers for the dropped blocks, which could result in 'PANIC: heap_delete_redo: no block' later on in xlog replay.
author: Tom Lane <tgl@sss.pgh.pa.us> 2004-05-31 03:48:10 +0000
committer: Tom Lane <tgl@sss.pgh.pa.us> 2004-05-31 03:48:10 +0000
commit: 9b178555fc1f5087c120ff4d26380395bc655a03 (patch)
tree: 3578c76707795c2b25910ea42b36928eb6d4d742 /src/backend/storage
parent: f024086db30f26905e4c877a6795c1ab95f4ab12 (diff)
download: postgresql-9b178555fc1f5087c120ff4d26380395bc655a03.tar.gz
postgresql-9b178555fc1f5087c120ff4d26380395bc655a03.zip
4 files changed, 317 insertions, 82 deletions
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index f718e33cd59..2386bc89bf3 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.166 2004/05/29 22:48:19 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.167 2004/05/31 03:48:02 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -1044,6 +1044,9 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
  *		bothering to write them out first.	This is NOT rollback-able,
  *		and so should be used only with extreme caution!
  *
+ *		There is no particularly good reason why this doesn't have a
+ *		firstDelBlock parameter, except that current callers don't need it.
+ *
  *		We assume that the caller holds an exclusive lock on the relation,
  *		which should assure that no new buffers will be acquired for the rel
  *		meanwhile.
@@ -1052,14 +1055,15 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
 void
 DropRelationBuffers(Relation rel)
 {
-	DropRelFileNodeBuffers(rel->rd_node, rel->rd_istemp);
+	DropRelFileNodeBuffers(rel->rd_node, rel->rd_istemp, 0);
 }
 
 /* ---------------------------------------------------------------------
  *		DropRelFileNodeBuffers
  *
  *		This is the same as DropRelationBuffers, except that the target
- *		relation is specified by RelFileNode and temp status.
+ *		relation is specified by RelFileNode and temp status, and one
+ *		may specify the first block to drop.
  *
  *		This is NOT rollback-able.	One legitimate use is to clear the
  *		buffer cache of buffers for a relation that is being deleted
@@ -1067,7 +1071,8 @@ DropRelationBuffers(Relation rel)
  * --------------------------------------------------------------------
  */
 void
-DropRelFileNodeBuffers(RelFileNode rnode, bool istemp)
+DropRelFileNodeBuffers(RelFileNode rnode, bool istemp,
+					   BlockNumber firstDelBlock)
 {
 	int			i;
 	BufferDesc *bufHdr;
@@ -1077,7 +1082,8 @@ DropRelFileNodeBuffers(RelFileNode rnode, bool istemp)
 		for (i = 0; i < NLocBuffer; i++)
 		{
 			bufHdr = &LocalBufferDescriptors[i];
-			if (RelFileNodeEquals(bufHdr->tag.rnode, rnode))
+			if (RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
+				bufHdr->tag.blockNum >= firstDelBlock)
 			{
 				bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED);
 				bufHdr->cntxDirty = false;
@@ -1094,7 +1100,8 @@ DropRelFileNodeBuffers(RelFileNode rnode, bool istemp)
 	{
 		bufHdr = &BufferDescriptors[i - 1];
 recheck:
-		if (RelFileNodeEquals(bufHdr->tag.rnode, rnode))
+		if (RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
+			bufHdr->tag.blockNum >= firstDelBlock)
 		{
 			/*
 			 * If there is I/O in progress, better wait till it's done;
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index 5ef12de9495..96de54110cf 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/storage/file/fd.c,v 1.108 2004/02/23 23:03:10 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/storage/file/fd.c,v 1.109 2004/05/31 03:48:04 tgl Exp $
  *
  * NOTES:
  *
@@ -484,6 +484,7 @@ Insert(File file)
 	DO_DB(_dump_lru());
 }
 
+/* returns 0 on success, -1 on re-open failure (with errno set) */
 static int
 LruInsert(File file)
 {
@@ -685,6 +686,7 @@ filepath(const char *filename)
 	return buf;
 }
 
+/* returns 0 on success, -1 on re-open failure (with errno set) */
 static int
 FileAccess(File file)
 {
@@ -954,7 +956,10 @@ FileRead(File file, char *buffer, int amount)
 			   file, VfdCache[file].fileName,
 			   VfdCache[file].seekPos, amount, buffer));
 
-	FileAccess(file);
+	returnCode = FileAccess(file);
+	if (returnCode < 0)
+		return returnCode;
+
 	returnCode = read(VfdCache[file].fd, buffer, amount);
 	if (returnCode > 0)
 		VfdCache[file].seekPos += returnCode;
@@ -975,7 +980,9 @@ FileWrite(File file, char *buffer, int amount)
 			   file, VfdCache[file].fileName,
 			   VfdCache[file].seekPos, amount, buffer));
 
-	FileAccess(file);
+	returnCode = FileAccess(file);
+	if (returnCode < 0)
+		return returnCode;
 
 	errno = 0;
 	returnCode = write(VfdCache[file].fd, buffer, amount);
@@ -992,9 +999,28 @@ FileWrite(File file, char *buffer, int amount)
 	return returnCode;
 }
 
+int
+FileSync(File file)
+{
+	int			returnCode;
+
+	Assert(FileIsValid(file));
+
+	DO_DB(elog(LOG, "FileSync: %d (%s)",
+			   file, VfdCache[file].fileName));
+
+	returnCode = FileAccess(file);
+	if (returnCode < 0)
+		return returnCode;
+
+	return pg_fsync(VfdCache[file].fd);
+}
+
 long
 FileSeek(File file, long offset, int whence)
 {
+	int			returnCode;
+
 	Assert(FileIsValid(file));
 
 	DO_DB(elog(LOG, "FileSeek: %d (%s) %ld %ld %d",
@@ -1014,8 +1040,11 @@ FileSeek(File file, long offset, int whence)
 				VfdCache[file].seekPos += offset;
 				break;
 			case SEEK_END:
-				FileAccess(file);
-				VfdCache[file].seekPos = lseek(VfdCache[file].fd, offset, whence);
+				returnCode = FileAccess(file);
+				if (returnCode < 0)
+					return returnCode;
+				VfdCache[file].seekPos = lseek(VfdCache[file].fd,
+											   offset, whence);
 				break;
 			default:
 				elog(ERROR, "invalid whence: %d", whence);
@@ -1030,14 +1059,17 @@ FileSeek(File file, long offset, int whence)
 				if (offset < 0)
 					elog(ERROR, "invalid seek offset: %ld", offset);
 				if (VfdCache[file].seekPos != offset)
-					VfdCache[file].seekPos = lseek(VfdCache[file].fd, offset, whence);
+					VfdCache[file].seekPos = lseek(VfdCache[file].fd,
+												   offset, whence);
 				break;
 			case SEEK_CUR:
 				if (offset != 0 || VfdCache[file].seekPos == FileUnknownPos)
-					VfdCache[file].seekPos = lseek(VfdCache[file].fd, offset, whence);
+					VfdCache[file].seekPos = lseek(VfdCache[file].fd,
+												   offset, whence);
 				break;
 			case SEEK_END:
-				VfdCache[file].seekPos = lseek(VfdCache[file].fd, offset, whence);
+				VfdCache[file].seekPos = lseek(VfdCache[file].fd,
+											   offset, whence);
 				break;
 			default:
 				elog(ERROR, "invalid whence: %d", whence);
@@ -1071,7 +1103,10 @@ FileTruncate(File file, long offset)
 	DO_DB(elog(LOG, "FileTruncate %d (%s)",
 			   file, VfdCache[file].fileName));
 
-	FileAccess(file);
+	returnCode = FileAccess(file);
+	if (returnCode < 0)
+		return returnCode;
+
 	returnCode = ftruncate(VfdCache[file].fd, (size_t) offset);
 	return returnCode;
 }
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 2122a243207..5ac5868f690 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.104 2004/04/19 17:42:58 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.105 2004/05/31 03:48:06 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -21,8 +21,10 @@
 
 #include "catalog/catalog.h"
 #include "miscadmin.h"
+#include "postmaster/bgwriter.h"
 #include "storage/fd.h"
 #include "storage/smgr.h"
+#include "utils/hsearch.h"
 #include "utils/memutils.h"
 
 
@@ -33,37 +35,68 @@
  *	system's file size limit (often 2GBytes).  In order to do that,
  *	we break relations up into chunks of < 2GBytes and store one chunk
  *	in each of several files that represent the relation.  See the
- *	BLCKSZ and RELSEG_SIZE configuration constants in
- *	include/pg_config.h.  All chunks except the last MUST have size exactly
- *	equal to RELSEG_SIZE blocks --- see mdnblocks() and mdtruncate().
+ *	BLCKSZ and RELSEG_SIZE configuration constants in pg_config_manual.h.
+ *	All chunks except the last MUST have size exactly equal to RELSEG_SIZE
+ *	blocks --- see mdnblocks() and mdtruncate().
  *
  *	The file descriptor pointer (md_fd field) stored in the SMgrRelation
  *	cache is, therefore, just the head of a list of MdfdVec objects.
  *	But note the md_fd pointer can be NULL, indicating relation not open.
  *
+ *	Note that mdfd_chain == NULL does not necessarily mean the relation
+ *	doesn't have another segment after this one; we may just not have
+ *	opened the next segment yet.  (We could not have "all segments are
+ *	in the chain" as an invariant anyway, since another backend could
+ *	extend the relation when we weren't looking.)
+ *
  *	All MdfdVec objects are palloc'd in the MdCxt memory context.
  */
 
 typedef struct _MdfdVec
 {
 	File		mdfd_vfd;			/* fd number in fd.c's pool */
-
-#ifndef LET_OS_MANAGE_FILESIZE
-	struct _MdfdVec *mdfd_chain;	/* for large relations */
+	BlockNumber	mdfd_segno;			/* segment number, from 0 */
+#ifndef LET_OS_MANAGE_FILESIZE		/* for large relations */
+	struct _MdfdVec *mdfd_chain;	/* next segment, or NULL */
 #endif
 } MdfdVec;
 
 static MemoryContext MdCxt;		/* context for all md.c allocations */
 
 
-/* routines declared here */
-static MdfdVec *mdopen(SMgrRelation reln);
+/*
+ * In some contexts (currently, standalone backends and the bgwriter process)
+ * we keep track of pending fsync operations: we need to remember all relation
+ * segments that have been written since the last checkpoint, so that we can
+ * fsync them down to disk before completing the next checkpoint.  This hash
+ * table remembers the pending operations.  We use a hash table not because
+ * we want to look up individual operations, but simply as a convenient way
+ * of eliminating duplicate requests.
+ *
+ * (Regular backends do not track pending operations locally, but forward
+ * them to the bgwriter.)
+ *
+ * XXX for WIN32, may want to expand this to track pending deletes, too.
+ */
+typedef struct
+{
+	RelFileNode	rnode;			/* the targeted relation */
+	BlockNumber	segno;			/* which segment */
+} PendingOperationEntry;
+
+static HTAB *pendingOpsTable = NULL;
+
+
+/* local routines */
+static MdfdVec *mdopen(SMgrRelation reln, bool allowNotFound);
+static bool register_dirty_segment(SMgrRelation reln, MdfdVec *seg);
 static MdfdVec *_fdvec_alloc(void);
 #ifndef LET_OS_MANAGE_FILESIZE
 static MdfdVec *_mdfd_openseg(SMgrRelation reln, BlockNumber segno,
 							  int oflags);
 #endif
-static MdfdVec *_mdfd_getseg(SMgrRelation reln, BlockNumber blkno);
+static MdfdVec *_mdfd_getseg(SMgrRelation reln, BlockNumber blkno,
+							 bool allowNotFound);
 static BlockNumber _mdnblocks(File file, Size blcksz);
 
 
@@ -79,6 +112,31 @@ mdinit(void)
 								  ALLOCSET_DEFAULT_INITSIZE,
 								  ALLOCSET_DEFAULT_MAXSIZE);
 
+	/*
+	 * Create pending-operations hashtable if we need it.  Currently,
+	 * we need it if we are standalone (not under a postmaster) OR
+	 * if we are a bootstrap-mode subprocess of a postmaster (that is,
+	 * a startup or bgwriter process).
+	 */
+	if (!IsUnderPostmaster || IsBootstrapProcessingMode())
+	{
+		HASHCTL		hash_ctl;
+
+		MemSet(&hash_ctl, 0, sizeof(hash_ctl));
+		hash_ctl.keysize = sizeof(PendingOperationEntry);
+		hash_ctl.entrysize = sizeof(PendingOperationEntry);
+		hash_ctl.hash = tag_hash;
+		hash_ctl.hcxt = MdCxt;
+		pendingOpsTable = hash_create("Pending Ops Table",
+									  100L,
+									  &hash_ctl,
+									  HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
+		if (pendingOpsTable == NULL)
+			ereport(FATAL,
+					(errcode(ERRCODE_OUT_OF_MEMORY),
+					 errmsg("out of memory")));
+	}
+
 	return true;
 }
 
@@ -130,6 +188,7 @@ mdcreate(SMgrRelation reln, bool isRedo)
 	reln->md_fd = _fdvec_alloc();
 
 	reln->md_fd->mdfd_vfd = fd;
+	reln->md_fd->mdfd_segno = 0;
 #ifndef LET_OS_MANAGE_FILESIZE
 	reln->md_fd->mdfd_chain = NULL;
 #endif
@@ -217,7 +276,7 @@ mdextend(SMgrRelation reln, BlockNumber blocknum, char *buffer)
 	int			nbytes;
 	MdfdVec    *v;
 
-	v = _mdfd_getseg(reln, blocknum);
+	v = _mdfd_getseg(reln, blocknum, false);
 
 #ifndef LET_OS_MANAGE_FILESIZE
 	seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
@@ -252,6 +311,9 @@ mdextend(SMgrRelation reln, BlockNumber blocknum, char *buffer)
 		return false;
 	}
 
+	if (!register_dirty_segment(reln, v))
+		return false;
+
 #ifndef LET_OS_MANAGE_FILESIZE
 	Assert(_mdnblocks(v->mdfd_vfd, BLCKSZ) <= ((BlockNumber) RELSEG_SIZE));
 #endif
@@ -261,12 +323,14 @@ mdextend(SMgrRelation reln, BlockNumber blocknum, char *buffer)
 
 /*
  *	mdopen() -- Open the specified relation.  ereport's on failure.
+ *		(Optionally, can return NULL instead of ereport for ENOENT.)
  *
  * Note we only open the first segment, when there are multiple segments.
  */
 static MdfdVec *
-mdopen(SMgrRelation reln)
+mdopen(SMgrRelation reln, bool allowNotFound)
 {
+	MdfdVec	   *mdfd;
 	char	   *path;
 	File		fd;
 
@@ -292,6 +356,8 @@ mdopen(SMgrRelation reln)
 		if (fd < 0)
 		{
 			pfree(path);
+			if (allowNotFound && errno == ENOENT)
+				return NULL;
 			ereport(ERROR,
 					(errcode_for_file_access(),
 					 errmsg("could not open relation %u/%u: %m",
@@ -302,15 +368,16 @@ mdopen(SMgrRelation reln)
 
 	pfree(path);
 
-	reln->md_fd = _fdvec_alloc();
+	reln->md_fd = mdfd = _fdvec_alloc();
 
-	reln->md_fd->mdfd_vfd = fd;
+	mdfd->mdfd_vfd = fd;
+	mdfd->mdfd_segno = 0;
 #ifndef LET_OS_MANAGE_FILESIZE
-	reln->md_fd->mdfd_chain = NULL;
+	mdfd->mdfd_chain = NULL;
 	Assert(_mdnblocks(fd, BLCKSZ) <= ((BlockNumber) RELSEG_SIZE));
 #endif
 
-	return reln->md_fd;
+	return mdfd;
 }
 
 /*
@@ -361,7 +428,7 @@ mdread(SMgrRelation reln, BlockNumber blocknum, char *buffer)
 	int			nbytes;
 	MdfdVec    *v;
 
-	v = _mdfd_getseg(reln, blocknum);
+	v = _mdfd_getseg(reln, blocknum, false);
 
 #ifndef LET_OS_MANAGE_FILESIZE
 	seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
@@ -403,7 +470,7 @@ mdwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer)
 	long		seekpos;
 	MdfdVec    *v;
 
-	v = _mdfd_getseg(reln, blocknum);
+	v = _mdfd_getseg(reln, blocknum, false);
 
 #ifndef LET_OS_MANAGE_FILESIZE
 	seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
@@ -418,6 +485,9 @@ mdwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer)
 	if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ)
 		return false;
 
+	if (!register_dirty_segment(reln, v))
+		return false;
+
 	return true;
 }
 
@@ -434,7 +504,7 @@ mdwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer)
 BlockNumber
 mdnblocks(SMgrRelation reln)
 {
-	MdfdVec    *v = mdopen(reln);
+	MdfdVec    *v = mdopen(reln, false);
 
 #ifndef LET_OS_MANAGE_FILESIZE
 	BlockNumber nblocks;
@@ -516,7 +586,7 @@ mdtruncate(SMgrRelation reln, BlockNumber nblocks)
 	if (nblocks == curnblk)
 		return nblocks;			/* no work */
 
-	v = mdopen(reln);
+	v = mdopen(reln, false);
 
 #ifndef LET_OS_MANAGE_FILESIZE
 	priorblocks = 0;
@@ -576,40 +646,154 @@ mdtruncate(SMgrRelation reln, BlockNumber nblocks)
 }
 
 /*
- *	mdcommit() -- Commit a transaction.
+ *	mdsync() -- Sync previous writes to stable storage.
+ *
+ * This is only called during checkpoints, and checkpoints should only
+ * occur in processes that have created a pendingOpsTable.
  */
 bool
-mdcommit(void)
+mdsync(void)
 {
+	HASH_SEQ_STATUS hstat;
+	PendingOperationEntry *entry;
+
+	if (!pendingOpsTable)
+		return false;
+
 	/*
-	 * We don't actually have to do anything here...
+	 * If we are in the bgwriter, the sync had better include all fsync
+	 * requests that were queued by backends before the checkpoint REDO
+	 * point was determined.  We go that a little better by accepting
+	 * all requests queued up to the point where we start fsync'ing.
 	 */
+	AbsorbFsyncRequests();
+
+	hash_seq_init(&hstat, pendingOpsTable);
+	while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
+	{
+		/*
+		 * If fsync is off then we don't have to bother opening the file
+		 * at all.  (We delay checking until this point so that changing
+		 * fsync on the fly behaves sensibly.)
+		 */
+		if (enableFsync)
+		{
+			SMgrRelation reln;
+			MdfdVec *seg;
+
+			/*
+			 * Find or create an smgr hash entry for this relation.
+			 * This may seem a bit unclean -- md calling smgr?  But it's
+			 * really the best solution.  It ensures that the open file
+			 * reference isn't permanently leaked if we get an error here.
+			 * (You may say "but an unreferenced SMgrRelation is still a
+			 * leak!"  Not really, because the only case in which a checkpoint
+			 * is done by a process that isn't about to shut down is in the
+			 * bgwriter, and it will periodically do smgrcloseall().  This
+			 * fact justifies our not closing the reln in the success path
+			 * either, which is a good thing since in non-bgwriter cases
+			 * we couldn't safely do that.)  Furthermore, in many cases
+			 * the relation will have been dirtied through this same smgr
+			 * relation, and so we can save a file open/close cycle.
+			 */
+			reln = smgropen(entry->rnode);
+
+			/*
+			 * It is possible that the relation has been dropped or truncated
+			 * since the fsync request was entered.  Therefore, we have to
+			 * allow file-not-found errors.  This applies both during
+			 * _mdfd_getseg() and during FileSync, since fd.c might have
+			 * closed the file behind our back.
+			 */
+			seg = _mdfd_getseg(reln,
+							   entry->segno * ((BlockNumber) RELSEG_SIZE),
+							   true);
+			if (seg)
+			{
+				if (FileSync(seg->mdfd_vfd) < 0 &&
+					errno != ENOENT)
+				{
+					ereport(LOG,
+							(errcode_for_file_access(),
+							 errmsg("could not fsync segment %u of relation %u/%u: %m",
+									entry->segno,
+									entry->rnode.tblNode,
+									entry->rnode.relNode)));
+					return false;
+				}
+			}
+		}
+
+		/* Okay, delete this entry */
+		if (hash_search(pendingOpsTable, entry,
+						HASH_REMOVE, NULL) == NULL)
+			elog(ERROR, "pendingOpsTable corrupted");
+	}
+
 	return true;
 }
 
 /*
- *	mdabort() -- Abort a transaction.
+ * register_dirty_segment() -- Mark a relation segment as needing fsync
+ *
+ * If there is a local pending-ops table, just make an entry in it for
+ * mdsync to process later.  Otherwise, try to pass off the fsync request
+ * to the background writer process.  If that fails, just do the fsync
+ * locally before returning (we expect this will not happen often enough
+ * to be a performance problem).
+ *
+ * A false result implies I/O failure during local fsync.  errno will be
+ * valid for error reporting.
  */
-bool
-mdabort(void)
+static bool
+register_dirty_segment(SMgrRelation reln, MdfdVec *seg)
 {
-	/*
-	 * We don't actually have to do anything here...
-	 */
+	if (pendingOpsTable)
+	{
+		PendingOperationEntry entry;
+
+		/* ensure any pad bytes in the struct are zeroed */
+		MemSet(&entry, 0, sizeof(entry));
+		entry.rnode = reln->smgr_rnode;
+		entry.segno = seg->mdfd_segno;
+
+		if (hash_search(pendingOpsTable, &entry, HASH_ENTER, NULL) != NULL)
+			return true;
+		/* out of memory: fall through to do it locally */
+	}
+	else
+	{
+		if (ForwardFsyncRequest(reln->smgr_rnode, seg->mdfd_segno))
+			return true;
+	}
+
+	if (FileSync(seg->mdfd_vfd) < 0)
+		return false;
 	return true;
 }
 
 /*
- *	mdsync() -- Sync previous writes to stable storage.
+ * RememberFsyncRequest() -- callback from bgwriter side of fsync request
+ *
+ * We stuff the fsync request into the local hash table for execution
+ * during the bgwriter's next checkpoint.
  */
-bool
-mdsync(void)
+void
+RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
 {
-	sync();
-	if (IsUnderPostmaster)
-		pg_usleep(2000000L);
-	sync();
-	return true;
+	PendingOperationEntry entry;
+
+	Assert(pendingOpsTable);
+
+	/* ensure any pad bytes in the struct are zeroed */
+	MemSet(&entry, 0, sizeof(entry));
+	entry.rnode = rnode;
+	entry.segno = segno;
+
+	if (hash_search(pendingOpsTable, &entry, HASH_ENTER, NULL) == NULL)
+		ereport(FATAL,
+				(errcode(ERRCODE_OUT_OF_MEMORY),
+				 errmsg("out of memory")));
 }
 
 /*
@@ -618,18 +802,11 @@ mdsync(void)
 static MdfdVec *
 _fdvec_alloc(void)
 {
-	MdfdVec *v;
-
-	v = (MdfdVec *) MemoryContextAlloc(MdCxt, sizeof(MdfdVec));
-	v->mdfd_vfd = -1;
-#ifndef LET_OS_MANAGE_FILESIZE
-	v->mdfd_chain = NULL;
-#endif
-
-	return v;
+	return (MdfdVec *) MemoryContextAlloc(MdCxt, sizeof(MdfdVec));
 }
 
 #ifndef LET_OS_MANAGE_FILESIZE
+
 /*
  * Open the specified segment of the relation,
  * and make a MdfdVec object for it.  Returns NULL on failure.
@@ -642,11 +819,11 @@ _mdfd_openseg(SMgrRelation reln, BlockNumber segno, int oflags)
 	char	   *path,
 			   *fullpath;
 
-	/* be sure we have enough space for the '.segno', if any */
 	path = relpath(reln->smgr_rnode);
 
 	if (segno > 0)
 	{
+		/* be sure we have enough space for the '.segno' */
 		fullpath = (char *) palloc(strlen(path) + 12);
 		sprintf(fullpath, "%s.%u", path, segno);
 		pfree(path);
@@ -667,32 +844,36 @@ _mdfd_openseg(SMgrRelation reln, BlockNumber segno, int oflags)
 
 	/* fill the entry */
 	v->mdfd_vfd = fd;
+	v->mdfd_segno = segno;
 	v->mdfd_chain = NULL;
 	Assert(_mdnblocks(fd, BLCKSZ) <= ((BlockNumber) RELSEG_SIZE));
 
 	/* all done */
 	return v;
 }
-#endif
+
+#endif /* LET_OS_MANAGE_FILESIZE */
 
 /*
  *	_mdfd_getseg() -- Find the segment of the relation holding the
- *					  specified block.  ereport's on failure.
+ *		specified block.  ereport's on failure.
+ *		(Optionally, can return NULL instead of ereport for ENOENT.)
  */
 static MdfdVec *
-_mdfd_getseg(SMgrRelation reln, BlockNumber blkno)
+_mdfd_getseg(SMgrRelation reln, BlockNumber blkno, bool allowNotFound)
 {
-	MdfdVec    *v = mdopen(reln);
-
+	MdfdVec    *v = mdopen(reln, allowNotFound);
 #ifndef LET_OS_MANAGE_FILESIZE
-	BlockNumber segno;
-	BlockNumber i;
+	BlockNumber segstogo;
+	BlockNumber nextsegno;
 
-	for (segno = blkno / ((BlockNumber) RELSEG_SIZE), i = 1;
-		 segno > 0;
-		 i++, segno--)
-	{
+	if (!v)
+		return NULL;			/* only possible if allowNotFound */
 
+	for (segstogo = blkno / ((BlockNumber) RELSEG_SIZE), nextsegno = 1;
+		 segstogo > 0;
+		 nextsegno++, segstogo--)
+	{
 		if (v->mdfd_chain == NULL)
 		{
 			/*
@@ -705,16 +886,21 @@ _mdfd_getseg(SMgrRelation reln, BlockNumber blkno)
 			 * one new segment per call, so this restriction seems
 			 * reasonable.
 			 */
-			v->mdfd_chain = _mdfd_openseg(reln, i, (segno == 1) ? O_CREAT : 0);
-
+			v->mdfd_chain = _mdfd_openseg(reln,
+										  nextsegno,
+										  (segstogo == 1) ? O_CREAT : 0);
 			if (v->mdfd_chain == NULL)
+			{
+				if (allowNotFound && errno == ENOENT)
+					return NULL;
 				ereport(ERROR,
 						(errcode_for_file_access(),
 						 errmsg("could not open segment %u of relation %u/%u (target block %u): %m",
-								i,
+								nextsegno,
 								reln->smgr_rnode.tblNode,
 								reln->smgr_rnode.relNode,
 								blkno)));
+			}
 		}
 		v = v->mdfd_chain;
 	}
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index d242744a4d7..c204e2796c4 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -11,7 +11,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/storage/smgr/smgr.c,v 1.70 2004/02/11 22:55:25 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/storage/smgr/smgr.c,v 1.71 2004/05/31 03:48:06 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -56,7 +56,7 @@ typedef struct f_smgr
 static const f_smgr smgrsw[] = {
 	/* magnetic disk */
 	{mdinit, NULL, mdclose, mdcreate, mdunlink, mdextend,
-	 mdread, mdwrite, mdnblocks, mdtruncate, mdcommit, mdabort, mdsync
+	 mdread, mdwrite, mdnblocks, mdtruncate, NULL, NULL, mdsync
 	}
 };
 
@@ -407,7 +407,7 @@ smgr_internal_unlink(RelFileNode rnode, int which, bool isTemp, bool isRedo)
 	 * Get rid of any leftover buffers for the rel (shouldn't be any in the
 	 * commit case, but there can be in the abort case).
 	 */
-	DropRelFileNodeBuffers(rnode, isTemp);
+	DropRelFileNodeBuffers(rnode, isTemp, 0);
 
 	/*
 	 * Tell the free space map to forget this relation.  It won't be accessed
@@ -638,7 +638,7 @@ smgrcommit(void)
 		if (smgrsw[i].smgr_commit)
 		{
 			if (! (*(smgrsw[i].smgr_commit)) ())
-				elog(FATAL, "transaction commit failed on %s: %m",
+				elog(ERROR, "transaction commit failed on %s: %m",
 					 DatumGetCString(DirectFunctionCall1(smgrout,
 													 Int16GetDatum(i))));
 		}
@@ -658,7 +658,7 @@ smgrabort(void)
 		if (smgrsw[i].smgr_abort)
 		{
 			if (! (*(smgrsw[i].smgr_abort)) ())
-				elog(FATAL, "transaction abort failed on %s: %m",
+				elog(ERROR, "transaction abort failed on %s: %m",
 					 DatumGetCString(DirectFunctionCall1(smgrout,
 													 Int16GetDatum(i))));
 		}
@@ -678,7 +678,7 @@ smgrsync(void)
 		if (smgrsw[i].smgr_sync)
 		{
 			if (! (*(smgrsw[i].smgr_sync)) ())
-				elog(PANIC, "storage sync failed on %s: %m",
+				elog(ERROR, "storage sync failed on %s: %m",
 					 DatumGetCString(DirectFunctionCall1(smgrout,
 													 Int16GetDatum(i))));
 		}
@@ -707,6 +707,13 @@ smgr_redo(XLogRecPtr lsn, XLogRecord *record)
 
 		reln = smgropen(xlrec->rnode);
 
+		/*
+		 * First, force bufmgr to drop any buffers it has for the to-be-
+		 * truncated blocks.  We must do this, else subsequent XLogReadBuffer
+		 * operations will not re-extend the file properly.
+		 */
+		DropRelFileNodeBuffers(xlrec->rnode, false, xlrec->blkno);
+
 		/* Can't use smgrtruncate because it would try to xlog */
 
 		/*
author	Tom Lane <tgl@sss.pgh.pa.us>	2004-05-31 03:48:10 +0000
committer	Tom Lane <tgl@sss.pgh.pa.us>	2004-05-31 03:48:10 +0000
commit	9b178555fc1f5087c120ff4d26380395bc655a03 (patch)
tree	3578c76707795c2b25910ea42b36928eb6d4d742 /src/backend/storage
parent	f024086db30f26905e4c877a6795c1ab95f4ab12 (diff)
download	postgresql-9b178555fc1f5087c120ff4d26380395bc655a03.tar.gz postgresql-9b178555fc1f5087c120ff4d26380395bc655a03.zip