aboutsummaryrefslogtreecommitdiff
path: root/src/backend/storage/buffer/bufmgr.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/storage/buffer/bufmgr.c')
-rw-r--r--src/backend/storage/buffer/bufmgr.c606
1 files changed, 419 insertions, 187 deletions
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index acc719ca4b6..0887f3d1ecd 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
- * $Header: /cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v 1.77 2000/03/31 02:43:31 tgl Exp $
+ * $Header: /cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v 1.78 2000/04/09 04:43:18 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -94,8 +94,10 @@ static Buffer ReadBufferWithBufferLock(Relation relation, BlockNumber blockNum,
bool bufferLockHeld);
static BufferDesc *BufferAlloc(Relation reln, BlockNumber blockNum,
bool *foundPtr, bool bufferLockHeld);
+static void SetBufferDirtiedByMe(Buffer buffer, BufferDesc *bufHdr);
+static void ClearBufferDirtiedByMe(Buffer buffer, BufferDesc *bufHdr);
static void BufferSync(void);
-static int BufferReplace(BufferDesc *bufHdr, bool bufferLockHeld);
+static int BufferReplace(BufferDesc *bufHdr);
void PrintBufferDescs(void);
/* ---------------------------------------------------
@@ -176,7 +178,7 @@ is_userbuffer(Buffer buffer)
{
BufferDesc *buf = &BufferDescriptors[buffer - 1];
- if (IsSystemRelationName(buf->sb_relname))
+ if (IsSystemRelationName(buf->blind.relname))
return false;
return true;
}
@@ -199,7 +201,7 @@ ReadBuffer_Debug(char *file,
fprintf(stderr, "PIN(RD) %ld relname = %s, blockNum = %d, \
refcount = %ld, file: %s, line: %d\n",
- buffer, buf->sb_relname, buf->tag.blockNum,
+ buffer, buf->blind.relname, buf->tag.blockNum,
PrivateRefCount[buffer - 1], file, line);
}
return buffer;
@@ -390,22 +392,21 @@ BufferAlloc(Relation reln,
* If there's no IO for the buffer and the buffer
* is BROKEN,it should be read again. So start a
* new buffer IO here.
-
- *
- * wierd race condition:
- *
- * We were waiting for someone else to read the buffer. While
- * we were waiting, the reader boof'd in some way, so the
- * contents of the buffer are still invalid. By saying
- * that we didn't find it, we can make the caller
- * reinitialize the buffer. If two processes are waiting
- * for this block, both will read the block. The second
- * one to finish may overwrite any updates made by the
- * first. (Assume higher level synchronization prevents
- * this from happening).
- *
- * This is never going to happen, don't worry about it.
- */
+ *
+ * wierd race condition:
+ *
+ * We were waiting for someone else to read the buffer. While
+ * we were waiting, the reader boof'd in some way, so the
+ * contents of the buffer are still invalid. By saying
+ * that we didn't find it, we can make the caller
+ * reinitialize the buffer. If two processes are waiting
+ * for this block, both will read the block. The second
+ * one to finish may overwrite any updates made by the
+ * first. (Assume higher level synchronization prevents
+ * this from happening).
+ *
+ * This is never going to happen, don't worry about it.
+ */
*foundPtr = FALSE;
}
#ifdef BMTRACE
@@ -465,33 +466,24 @@ BufferAlloc(Relation reln,
* in WaitIO until we're done.
*/
inProgress = TRUE;
-#ifdef HAS_TEST_AND_SET
/*
* All code paths that acquire this lock pin the buffer first;
* since no one had it pinned (it just came off the free
* list), no one else can have this lock.
*/
-#endif /* HAS_TEST_AND_SET */
StartBufferIO(buf, false);
/*
* Write the buffer out, being careful to release BufMgrLock
* before starting the I/O.
- *
- * This #ifndef is here because a few extra semops REALLY kill
- * you on machines that don't have spinlocks. If you don't
- * operate with much concurrency, well...
*/
- smok = BufferReplace(buf, true);
-#ifndef OPTIMIZE_SINGLE
- SpinAcquire(BufMgrLock);
-#endif /* OPTIMIZE_SINGLE */
+ smok = BufferReplace(buf);
if (smok == FALSE)
{
elog(NOTICE, "BufferAlloc: cannot write block %u for %s/%s",
- buf->tag.blockNum, buf->sb_dbname, buf->sb_relname);
+ buf->tag.blockNum, buf->blind.dbname, buf->blind.relname);
inProgress = FALSE;
buf->flags |= BM_IO_ERROR;
buf->flags &= ~BM_IO_IN_PROGRESS;
@@ -516,7 +508,7 @@ BufferAlloc(Relation reln,
if (buf->flags & BM_JUST_DIRTIED)
{
elog(FATAL, "BufferAlloc: content of block %u (%s) changed while flushing",
- buf->tag.blockNum, buf->sb_relname);
+ buf->tag.blockNum, buf->blind.relname);
}
else
buf->flags &= ~BM_DIRTY;
@@ -562,6 +554,7 @@ BufferAlloc(Relation reln,
*/
if (buf != NULL)
{
+ buf->flags &= ~BM_IO_IN_PROGRESS;
TerminateBufferIO(buf);
/* give up the buffer since we don't need it any more */
PrivateRefCount[BufferDescriptorGetBuffer(buf) - 1] = 0;
@@ -572,7 +565,6 @@ BufferAlloc(Relation reln,
AddBufferToFreelist(buf);
buf->flags |= BM_FREE;
}
- buf->flags &= ~BM_IO_IN_PROGRESS;
}
PinBuffer(buf2);
@@ -619,8 +611,8 @@ BufferAlloc(Relation reln,
}
/* record the database name and relation name for this buffer */
- strcpy(buf->sb_relname, RelationGetPhysicalRelationName(reln));
- strcpy(buf->sb_dbname, DatabaseName);
+ strcpy(buf->blind.dbname, DatabaseName);
+ strcpy(buf->blind.relname, RelationGetPhysicalRelationName(reln));
INIT_BUFFERTAG(&(buf->tag), reln, blockNum);
if (!BufTableInsert(buf))
@@ -683,9 +675,9 @@ WriteBuffer(Buffer buffer)
SpinAcquire(BufMgrLock);
Assert(bufHdr->refcount > 0);
bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
+ SetBufferDirtiedByMe(buffer, bufHdr);
UnpinBuffer(bufHdr);
SpinRelease(BufMgrLock);
- CommitInfoNeedsSave[buffer - 1] = 0;
return TRUE;
}
@@ -702,7 +694,7 @@ WriteBuffer_Debug(char *file, int line, Buffer buffer)
buf = &BufferDescriptors[buffer - 1];
fprintf(stderr, "UNPIN(WR) %ld relname = %s, blockNum = %d, \
refcount = %ld, file: %s, line: %d\n",
- buffer, buf->sb_relname, buf->tag.blockNum,
+ buffer, buf->blind.relname, buf->tag.blockNum,
PrivateRefCount[buffer - 1], file, line);
}
}
@@ -767,8 +759,9 @@ DirtyBufferCopy(Oid dbid, Oid relid, BlockNumber blkno, char *dest)
*
* 'buffer' is known to be dirty/pinned, so there should not be a
* problem reading the BufferDesc members without the BufMgrLock
- * (nobody should be able to change tags, flags, etc. out from under
- * us). Unpin if 'release' is TRUE.
+ * (nobody should be able to change tags out from under us).
+ *
+ * Unpin if 'release' is TRUE.
*/
int
FlushBuffer(Buffer buffer, bool release)
@@ -784,6 +777,8 @@ FlushBuffer(Buffer buffer, bool release)
if (BAD_BUFFER_ID(buffer))
return STATUS_ERROR;
+ Assert(PrivateRefCount[buffer - 1] > 0); /* else caller didn't pin */
+
bufHdr = &BufferDescriptors[buffer - 1];
bufdb = bufHdr->tag.relId.dbId;
@@ -809,7 +804,7 @@ FlushBuffer(Buffer buffer, bool release)
if (status == SM_FAIL)
{
elog(ERROR, "FlushBuffer: cannot flush block %u of the relation %s",
- bufHdr->tag.blockNum, bufHdr->sb_relname);
+ bufHdr->tag.blockNum, bufHdr->blind.relname);
return STATUS_ERROR;
}
BufferFlushCount++;
@@ -820,19 +815,21 @@ FlushBuffer(Buffer buffer, bool release)
/*
* If this buffer was marked by someone as DIRTY while we were
- * flushing it out we must not clear DIRTY flag - vadim 01/17/97
+ * flushing it out we must not clear shared DIRTY flag - vadim 01/17/97
+ *
+ * ... but we can clear BufferDirtiedByMe anyway - tgl 3/31/00
*/
if (bufHdr->flags & BM_JUST_DIRTIED)
{
elog(NOTICE, "FlushBuffer: content of block %u (%s) changed while flushing",
- bufHdr->tag.blockNum, bufHdr->sb_relname);
+ bufHdr->tag.blockNum, bufHdr->blind.relname);
}
else
bufHdr->flags &= ~BM_DIRTY;
+ ClearBufferDirtiedByMe(buffer, bufHdr);
if (release)
UnpinBuffer(bufHdr);
SpinRelease(BufMgrLock);
- CommitInfoNeedsSave[buffer - 1] = 0;
return STATUS_OK;
}
@@ -857,9 +854,10 @@ WriteNoReleaseBuffer(Buffer buffer)
SharedBufferChanged = true;
SpinAcquire(BufMgrLock);
+ Assert(bufHdr->refcount > 0);
bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
+ SetBufferDirtiedByMe(buffer, bufHdr);
SpinRelease(BufMgrLock);
- CommitInfoNeedsSave[buffer - 1] = 0;
return STATUS_OK;
}
@@ -901,11 +899,6 @@ ReleaseAndReadBuffer(Buffer buffer,
AddBufferToFreelist(bufHdr);
bufHdr->flags |= BM_FREE;
}
- if (CommitInfoNeedsSave[buffer - 1])
- {
- bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
- CommitInfoNeedsSave[buffer - 1] = 0;
- }
retbuf = ReadBufferWithBufferLock(relation, blockNum, true);
return retbuf;
}
@@ -916,12 +909,119 @@ ReleaseAndReadBuffer(Buffer buffer,
}
/*
+ * SetBufferDirtiedByMe -- mark a shared buffer as being dirtied by this xact
+ *
+ * This flag essentially remembers that we need to write and fsync this buffer
+ * before we can commit the transaction. The write might end up getting done
+ * by another backend, but we must do the fsync ourselves (else we could
+ * commit before the data actually reaches disk). We do not issue fsync
+ * instantly upon write; the storage manager keeps track of which files need
+ * to be fsync'd before commit can occur. A key aspect of this data structure
+ * is that we will be able to notify the storage manager that an fsync is
+ * needed even after another backend has done the physical write and replaced
+ * the buffer contents with something else!
+ *
+ * NB: we must be holding the bufmgr lock at entry, and the buffer must be
+ * pinned so that no other backend can take it away from us.
+ */
+static void
+SetBufferDirtiedByMe(Buffer buffer, BufferDesc *bufHdr)
+{
+ BufferTag *tagLastDirtied = & BufferTagLastDirtied[buffer - 1];
+ Relation reln;
+ int status;
+
+ /*
+ * If the flag is already set, check to see whether the buffertag is
+ * the same. If not, some other backend already wrote the buffer data
+ * that we dirtied. We must tell the storage manager to make an fsync
+ * pending on that file before we can overwrite the old tag value.
+ */
+ if (BufferDirtiedByMe[buffer - 1])
+ {
+ if (bufHdr->tag.relId.dbId == tagLastDirtied->relId.dbId &&
+ bufHdr->tag.relId.relId == tagLastDirtied->relId.relId &&
+ bufHdr->tag.blockNum == tagLastDirtied->blockNum)
+ return; /* Same tag already dirtied, so no work */
+
+#ifndef OPTIMIZE_SINGLE
+ SpinRelease(BufMgrLock);
+#endif /* OPTIMIZE_SINGLE */
+
+ reln = RelationIdCacheGetRelation(tagLastDirtied->relId.relId);
+
+ if (reln == (Relation) NULL)
+ {
+ status = smgrblindmarkdirty(DEFAULT_SMGR,
+ BufferBlindLastDirtied[buffer - 1].dbname,
+ BufferBlindLastDirtied[buffer - 1].relname,
+ tagLastDirtied->relId.dbId,
+ tagLastDirtied->relId.relId,
+ tagLastDirtied->blockNum);
+ }
+ else
+ {
+ status = smgrmarkdirty(DEFAULT_SMGR, reln,
+ tagLastDirtied->blockNum);
+ /* drop relcache refcnt incremented by RelationIdCacheGetRelation */
+ RelationDecrementReferenceCount(reln);
+ }
+ if (status == SM_FAIL)
+ {
+ elog(ERROR, "SetBufferDirtiedByMe: cannot mark %u for %s",
+ tagLastDirtied->blockNum,
+ BufferBlindLastDirtied[buffer - 1].relname);
+ }
+
+#ifndef OPTIMIZE_SINGLE
+ SpinAcquire(BufMgrLock);
+#endif /* OPTIMIZE_SINGLE */
+
+ }
+
+ *tagLastDirtied = bufHdr->tag;
+ BufferBlindLastDirtied[buffer - 1] = bufHdr->blind;
+ BufferDirtiedByMe[buffer - 1] = true;
+}
+
+/*
+ * ClearBufferDirtiedByMe -- mark a shared buffer as no longer needing fsync
+ *
+ * If we write out a buffer ourselves, then the storage manager will set its
+ * needs-fsync flag for that file automatically, and so we can clear our own
+ * flag that says it needs to be done later.
+ *
+ * NB: we must be holding the bufmgr lock at entry.
+ */
+static void
+ClearBufferDirtiedByMe(Buffer buffer, BufferDesc *bufHdr)
+{
+ BufferTag *tagLastDirtied = & BufferTagLastDirtied[buffer - 1];
+
+ /*
+ * Do *not* clear the flag if it refers to some other buffertag than
+ * the data we just wrote. This is unlikely, but possible if some
+ * other backend replaced the buffer contents since we set our flag.
+ */
+ if (bufHdr->tag.relId.dbId == tagLastDirtied->relId.dbId &&
+ bufHdr->tag.relId.relId == tagLastDirtied->relId.relId &&
+ bufHdr->tag.blockNum == tagLastDirtied->blockNum)
+ {
+ BufferDirtiedByMe[buffer - 1] = false;
+ }
+}
+
+/*
* BufferSync -- Flush all dirty buffers in the pool.
*
- * This is called at transaction commit time. It does the wrong thing,
- * right now. We should flush only our own changes to stable storage,
- * and we should obey the lock protocol on the buffer manager metadata
- * as we do it. Also, we need to be sure that no other transaction is
+ * This is called at transaction commit time. We find all buffers
+ * that have been dirtied by the current xact and flush them to disk.
+ * We do *not* flush dirty buffers that have been dirtied by other xacts.
+ * (This is a substantial change from pre-7.0 behavior.)
+ *
+ * OLD COMMENTS (do these still apply?)
+ *
+ * Also, we need to be sure that no other transaction is
* modifying the page as we flush it. This is only a problem for objects
* that use a non-two-phase locking protocol, like btree indices. For
* those objects, we would like to set a write lock for the duration of
@@ -936,21 +1036,49 @@ static void
BufferSync()
{
int i;
- Oid bufdb;
- Oid bufrel;
- Relation reln;
BufferDesc *bufHdr;
int status;
+ Relation reln;
+ bool didwrite;
- SpinAcquire(BufMgrLock);
for (i = 0, bufHdr = BufferDescriptors; i < NBuffers; i++, bufHdr++)
{
+ /* Ignore buffers that were not dirtied by me */
+ if (! BufferDirtiedByMe[i])
+ continue;
+
+ SpinAcquire(BufMgrLock);
+
+ /*
+ * We only need to write if the buffer is still dirty and still
+ * contains the same disk page that it contained when we dirtied it.
+ * Otherwise, someone else has already written our changes for us,
+ * and we need only fsync.
+ *
+ * (NOTE: it's still possible to do an unnecessary write, if other
+ * xacts have written and then re-dirtied the page since our last
+ * change to it. But that should be pretty uncommon, and there's
+ * no easy way to detect it anyway.)
+ */
+ reln = NULL;
+ didwrite = false;
if ((bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
{
+ Oid bufdb;
+ Oid bufrel;
+
bufdb = bufHdr->tag.relId.dbId;
bufrel = bufHdr->tag.relId.relId;
- if (bufdb == MyDatabaseId || bufdb == (Oid) 0)
+ if (bufdb == BufferTagLastDirtied[i].relId.dbId &&
+ bufrel == BufferTagLastDirtied[i].relId.relId &&
+ bufHdr->tag.blockNum == BufferTagLastDirtied[i].blockNum)
{
+ /*
+ * Try to find relation for buf. This could fail, if the
+ * rel has been flushed from the relcache since we dirtied
+ * the page. That should be uncommon, so paying the extra
+ * cost of a blind write when it happens seems OK.
+ */
reln = RelationIdCacheGetRelation(bufrel);
/*
@@ -970,74 +1098,114 @@ BufferSync()
if (bufHdr->flags & BM_IO_ERROR)
{
elog(ERROR, "BufferSync: write error %u for %s",
- bufHdr->tag.blockNum, bufHdr->sb_relname);
+ bufHdr->tag.blockNum, bufHdr->blind.relname);
}
- /* drop refcnt from RelationIdCacheGetRelation */
- if (reln != (Relation) NULL)
- RelationDecrementReferenceCount(reln);
- continue;
- }
-
- /*
- * To check if block content changed while flushing (see
- * below). - vadim 01/17/97
- */
- WaitIO(bufHdr, BufMgrLock); /* confirm end of IO */
- bufHdr->flags &= ~BM_JUST_DIRTIED;
- StartBufferIO(bufHdr, false); /* output IO start */
-
- /*
- * If we didn't have the reldesc in our local cache, flush
- * this page out using the 'blind write' storage manager
- * routine. If we did find it, use the standard
- * interface.
- */
-
-#ifndef OPTIMIZE_SINGLE
- SpinRelease(BufMgrLock);
-#endif /* OPTIMIZE_SINGLE */
- if (reln == (Relation) NULL)
- {
- status = smgrblindwrt(DEFAULT_SMGR, bufHdr->sb_dbname,
- bufHdr->sb_relname, bufdb, bufrel,
- bufHdr->tag.blockNum,
- (char *) MAKE_PTR(bufHdr->data));
}
else
{
- status = smgrwrite(DEFAULT_SMGR, reln,
- bufHdr->tag.blockNum,
- (char *) MAKE_PTR(bufHdr->data));
- }
+ /*
+ * To check if block content changed while flushing (see
+ * below). - vadim 01/17/97
+ */
+ WaitIO(bufHdr, BufMgrLock); /* confirm end of IO */
+ bufHdr->flags &= ~BM_JUST_DIRTIED;
+ StartBufferIO(bufHdr, false); /* output IO start */
+
+ /*
+ * If we didn't have the reldesc in our local cache, write
+ * this page out using the 'blind write' storage manager
+ * routine. If we did find it, use the standard
+ * interface.
+ */
#ifndef OPTIMIZE_SINGLE
- SpinAcquire(BufMgrLock);
+ SpinRelease(BufMgrLock);
+#endif /* OPTIMIZE_SINGLE */
+ if (reln == (Relation) NULL)
+ {
+ status = smgrblindwrt(DEFAULT_SMGR,
+ bufHdr->blind.dbname,
+ bufHdr->blind.relname,
+ bufdb, bufrel,
+ bufHdr->tag.blockNum,
+ (char *) MAKE_PTR(bufHdr->data));
+ }
+ else
+ {
+ status = smgrwrite(DEFAULT_SMGR, reln,
+ bufHdr->tag.blockNum,
+ (char *) MAKE_PTR(bufHdr->data));
+ }
+#ifndef OPTIMIZE_SINGLE
+ SpinAcquire(BufMgrLock);
#endif /* OPTIMIZE_SINGLE */
- UnpinBuffer(bufHdr);
- if (status == SM_FAIL)
- {
- bufHdr->flags |= BM_IO_ERROR;
- elog(ERROR, "BufferSync: cannot write %u for %s",
- bufHdr->tag.blockNum, bufHdr->sb_relname);
+ UnpinBuffer(bufHdr);
+ if (status == SM_FAIL)
+ {
+ bufHdr->flags |= BM_IO_ERROR;
+ elog(ERROR, "BufferSync: cannot write %u for %s",
+ bufHdr->tag.blockNum, bufHdr->blind.relname);
+ }
+ bufHdr->flags &= ~BM_IO_IN_PROGRESS; /* mark IO finished */
+ TerminateBufferIO(bufHdr); /* Sync IO finished */
+ BufferFlushCount++;
+ didwrite = true;
+
+ /*
+ * If this buffer was marked by someone as DIRTY while we
+ * were flushing it out we must not clear DIRTY flag -
+ * vadim 01/17/97
+ *
+ * but it is OK to clear BufferDirtiedByMe - tgl 3/31/00
+ */
+ if (!(bufHdr->flags & BM_JUST_DIRTIED))
+ bufHdr->flags &= ~BM_DIRTY;
}
- bufHdr->flags &= ~BM_IO_IN_PROGRESS; /* mark IO finished */
- TerminateBufferIO(bufHdr); /* Sync IO finished */
- BufferFlushCount++;
- /*
- * If this buffer was marked by someone as DIRTY while we
- * were flushing it out we must not clear DIRTY flag -
- * vadim 01/17/97
- */
- if (!(bufHdr->flags & BM_JUST_DIRTIED))
- bufHdr->flags &= ~BM_DIRTY;
- /* drop refcnt from RelationIdCacheGetRelation */
+ /* drop refcnt obtained by RelationIdCacheGetRelation */
if (reln != (Relation) NULL)
RelationDecrementReferenceCount(reln);
}
}
+
+ /*
+ * If we did not write the buffer (because someone else did),
+ * we must still fsync the file containing it, to ensure that the
+ * write is down to disk before we commit.
+ */
+ if (! didwrite)
+ {
+#ifndef OPTIMIZE_SINGLE
+ SpinRelease(BufMgrLock);
+#endif /* OPTIMIZE_SINGLE */
+
+ reln = RelationIdCacheGetRelation(BufferTagLastDirtied[i].relId.relId);
+ if (reln == (Relation) NULL)
+ {
+ status = smgrblindmarkdirty(DEFAULT_SMGR,
+ BufferBlindLastDirtied[i].dbname,
+ BufferBlindLastDirtied[i].relname,
+ BufferTagLastDirtied[i].relId.dbId,
+ BufferTagLastDirtied[i].relId.relId,
+ BufferTagLastDirtied[i].blockNum);
+ }
+ else
+ {
+ status = smgrmarkdirty(DEFAULT_SMGR, reln,
+ BufferTagLastDirtied[i].blockNum);
+ /* drop relcache refcnt incremented by RelationIdCacheGetRelation */
+ RelationDecrementReferenceCount(reln);
+
+ }
+#ifndef OPTIMIZE_SINGLE
+ SpinAcquire(BufMgrLock);
+#endif /* OPTIMIZE_SINGLE */
+ }
+
+ BufferDirtiedByMe[i] = false;
+
+ SpinRelease(BufMgrLock);
}
- SpinRelease(BufMgrLock);
LocalBufferSync();
}
@@ -1166,13 +1334,19 @@ ResetBufferUsage()
/* ----------------------------------------------
* ResetBufferPool
*
- * this routine is supposed to be called when a transaction aborts.
+ * This routine is supposed to be called when a transaction aborts.
* it will release all the buffer pins held by the transaction.
+ * Currently, we also call it during commit if BufferPoolCheckLeak
+ * detected a problem --- in that case, isCommit is TRUE, and we
+ * only clean up buffer pin counts.
+ *
+ * During abort, we also forget any pending fsync requests. Dirtied buffers
+ * will still get written, eventually, but there will be no fsync for them.
*
* ----------------------------------------------
*/
void
-ResetBufferPool()
+ResetBufferPool(bool isCommit)
{
int i;
@@ -1193,10 +1367,15 @@ ResetBufferPool()
SpinRelease(BufMgrLock);
}
PrivateRefCount[i] = 0;
- CommitInfoNeedsSave[i] = 0;
+
+ if (! isCommit)
+ BufferDirtiedByMe[i] = false;
}
ResetLocalBufferPool();
+
+ if (! isCommit)
+ smgrabort();
}
/* -----------------------------------------------
@@ -1222,7 +1401,7 @@ BufferPoolCheckLeak()
"Buffer Leak: [%03d] (freeNext=%ld, freePrev=%ld, \
relname=%s, blockNum=%d, flags=0x%x, refcount=%d %ld)",
i - 1, buf->freeNext, buf->freePrev,
- buf->sb_relname, buf->tag.blockNum, buf->flags,
+ buf->blind.relname, buf->tag.blockNum, buf->flags,
buf->refcount, PrivateRefCount[i - 1]);
result = 1;
}
@@ -1306,25 +1485,25 @@ BufferGetRelation(Buffer buffer)
/*
* BufferReplace
*
- * Flush the buffer corresponding to 'bufHdr'
+ * Write out the buffer corresponding to 'bufHdr'
*
+ * This routine used to flush the data to disk (ie, force immediate fsync)
+ * but that's no longer necessary because BufferSync is smarter than before.
+ *
+ * BufMgrLock must be held at entry, and the buffer must be pinned.
*/
static int
-BufferReplace(BufferDesc *bufHdr, bool bufferLockHeld)
+BufferReplace(BufferDesc *bufHdr)
{
Relation reln;
Oid bufdb,
bufrel;
int status;
- if (!bufferLockHeld)
- SpinAcquire(BufMgrLock);
-
/*
* first try to find the reldesc in the cache, if no luck, don't
* bother to build the reldesc from scratch, just do a blind write.
*/
-
bufdb = bufHdr->tag.relId.dbId;
bufrel = bufHdr->tag.relId.relId;
@@ -1336,22 +1515,27 @@ BufferReplace(BufferDesc *bufHdr, bool bufferLockHeld)
/* To check if block content changed while flushing. - vadim 01/17/97 */
bufHdr->flags &= ~BM_JUST_DIRTIED;
+#ifndef OPTIMIZE_SINGLE
SpinRelease(BufMgrLock);
+#endif /* OPTIMIZE_SINGLE */
if (reln != (Relation) NULL)
{
- status = smgrflush(DEFAULT_SMGR, reln, bufHdr->tag.blockNum,
+ status = smgrwrite(DEFAULT_SMGR, reln, bufHdr->tag.blockNum,
(char *) MAKE_PTR(bufHdr->data));
}
else
{
- /* blind write always flushes */
- status = smgrblindwrt(DEFAULT_SMGR, bufHdr->sb_dbname,
- bufHdr->sb_relname, bufdb, bufrel,
+ status = smgrblindwrt(DEFAULT_SMGR, bufHdr->blind.dbname,
+ bufHdr->blind.relname, bufdb, bufrel,
bufHdr->tag.blockNum,
(char *) MAKE_PTR(bufHdr->data));
}
+#ifndef OPTIMIZE_SINGLE
+ SpinAcquire(BufMgrLock);
+#endif /* OPTIMIZE_SINGLE */
+
/* drop relcache refcnt incremented by RelationIdCacheGetRelation */
if (reln != (Relation) NULL)
RelationDecrementReferenceCount(reln);
@@ -1359,6 +1543,11 @@ BufferReplace(BufferDesc *bufHdr, bool bufferLockHeld)
if (status == SM_FAIL)
return FALSE;
+ /* If we had marked this buffer as needing to be fsync'd, we can forget
+ * about that, because it's now the storage manager's responsibility.
+ */
+ ClearBufferDirtiedByMe(BufferDescriptorGetBuffer(bufHdr), bufHdr);
+
BufferFlushCount++;
return TRUE;
@@ -1440,7 +1629,7 @@ ReleaseRelationBuffers(Relation rel)
}
/* Now we can do what we came for */
buf->flags &= ~ ( BM_DIRTY | BM_JUST_DIRTIED);
- CommitInfoNeedsSave[i - 1] = 0;
+ ClearBufferDirtiedByMe(i, buf);
/*
* Release any refcount we may have.
*
@@ -1502,6 +1691,7 @@ DropBuffers(Oid dbid)
}
/* Now we can do what we came for */
buf->flags &= ~ ( BM_DIRTY | BM_JUST_DIRTIED);
+ ClearBufferDirtiedByMe(i, buf);
/*
* The thing should be free, if caller has checked that
* no backends are running in that database.
@@ -1533,7 +1723,7 @@ PrintBufferDescs()
elog(DEBUG, "[%02d] (freeNext=%ld, freePrev=%ld, relname=%s, \
blockNum=%d, flags=0x%x, refcount=%d %ld)",
i, buf->freeNext, buf->freePrev,
- buf->sb_relname, buf->tag.blockNum, buf->flags,
+ buf->blind.relname, buf->tag.blockNum, buf->flags,
buf->refcount, PrivateRefCount[i]);
}
SpinRelease(BufMgrLock);
@@ -1544,7 +1734,7 @@ blockNum=%d, flags=0x%x, refcount=%d %ld)",
for (i = 0; i < NBuffers; ++i, ++buf)
{
printf("[%-2d] (%s, %d) flags=0x%x, refcnt=%d %ld)\n",
- i, buf->sb_relname, buf->tag.blockNum,
+ i, buf->blind.relname, buf->tag.blockNum,
buf->flags, buf->refcount, PrivateRefCount[i]);
}
}
@@ -1562,7 +1752,7 @@ PrintPinnedBufs()
if (PrivateRefCount[i] > 0)
elog(NOTICE, "[%02d] (freeNext=%ld, freePrev=%ld, relname=%s, \
blockNum=%d, flags=0x%x, refcount=%d %ld)\n",
- i, buf->freeNext, buf->freePrev, buf->sb_relname,
+ i, buf->freeNext, buf->freePrev, buf->blind.relname,
buf->tag.blockNum, buf->flags,
buf->refcount, PrivateRefCount[i]);
}
@@ -1601,33 +1791,42 @@ BufferPoolBlowaway()
* FlushRelationBuffers
*
* This function removes from the buffer pool all pages of a relation
- * that have blocknumber >= specified block. If doFlush is true,
- * dirty buffers are written out --- otherwise it's an error for any
- * of the buffers to be dirty.
+ * that have blocknumber >= specified block. Pages that are dirty are
+ * written out first. If expectDirty is false, a notice is emitted
+ * warning of dirty buffers, but we proceed anyway. An error code is
+ * returned if we fail to dump a dirty buffer or if we find one of
+ * the target pages is pinned into the cache.
*
* This is used by VACUUM before truncating the relation to the given
- * number of blocks. For VACUUM, we pass doFlush = false since it would
- * mean a bug in VACUUM if any of the unwanted pages were still dirty.
- * (TRUNCATE TABLE also uses it in the same way.)
+ * number of blocks. For VACUUM, we pass expectDirty = false since it
+ * could mean a bug in VACUUM if any of the unwanted pages were still
+ * dirty. (TRUNCATE TABLE also uses it in the same way.)
*
- * This is also used by RENAME TABLE (with block = 0 and doFlush = true)
+ * This is also used by RENAME TABLE (with block=0 and expectDirty=true)
* to clear out the buffer cache before renaming the physical files of
* a relation. Without that, some other backend might try to do a
- * blind write of a buffer page (relying on the sb_relname of the buffer)
+ * blind write of a buffer page (relying on the BlindId of the buffer)
* and fail because it's not got the right filename anymore.
*
* In both cases, the caller should be holding AccessExclusiveLock on
* the target relation to ensure that no other backend is busy reading
- * more blocks of the relation...
+ * more blocks of the relation.
+ *
+ * Formerly, we considered it an error condition if we found unexpectedly
+ * dirty buffers. However, since BufferSync no longer forces out all
+ * dirty buffers at every xact commit, it's possible for dirty buffers
+ * to still be present in the cache due to failure of an earlier
+ * transaction. So, downgrade the error to a mere notice. Maybe we
+ * shouldn't even emit a notice...
*
- * Returns: 0 - Ok, -1 - DIRTY, -2 - PINNED
+ * Returns: 0 - Ok, -1 - FAILED TO WRITE DIRTY BUFFER, -2 - PINNED
*
* XXX currently it sequentially searches the buffer pool, should be
* changed to more clever ways of searching.
* --------------------------------------------------------------------
*/
int
-FlushRelationBuffers(Relation rel, BlockNumber block, bool doFlush)
+FlushRelationBuffers(Relation rel, BlockNumber block, bool expectDirty)
{
int i;
BufferDesc *buf;
@@ -1642,21 +1841,15 @@ FlushRelationBuffers(Relation rel, BlockNumber block, bool doFlush)
{
if (buf->flags & BM_DIRTY)
{
- if (doFlush)
- {
- if (FlushBuffer(-i-1, false) != STATUS_OK)
- {
- elog(NOTICE, "FlushRelationBuffers(%s (local), %u): block %u is dirty, could not flush it",
- RelationGetRelationName(rel),
- block, buf->tag.blockNum);
- return -1;
- }
- }
- else
- {
+ if (! expectDirty)
elog(NOTICE, "FlushRelationBuffers(%s (local), %u): block %u is dirty",
RelationGetRelationName(rel),
block, buf->tag.blockNum);
+ if (FlushBuffer(-i-1, false) != STATUS_OK)
+ {
+ elog(NOTICE, "FlushRelationBuffers(%s (local), %u): block %u is dirty, could not flush it",
+ RelationGetRelationName(rel),
+ block, buf->tag.blockNum);
return -1;
}
}
@@ -1676,39 +1869,42 @@ FlushRelationBuffers(Relation rel, BlockNumber block, bool doFlush)
SpinAcquire(BufMgrLock);
for (i = 0; i < NBuffers; i++)
{
+ recheck:
buf = &BufferDescriptors[i];
- if (buf->tag.relId.dbId == MyDatabaseId &&
- buf->tag.relId.relId == RelationGetRelid(rel) &&
+ if (buf->tag.relId.relId == RelationGetRelid(rel) &&
+ (buf->tag.relId.dbId == MyDatabaseId ||
+ buf->tag.relId.dbId == (Oid) NULL) &&
buf->tag.blockNum >= block)
{
if (buf->flags & BM_DIRTY)
{
- if (doFlush)
- {
- SpinRelease(BufMgrLock);
- if (FlushBuffer(i+1, false) != STATUS_OK)
- {
- elog(NOTICE, "FlushRelationBuffers(%s, %u): block %u is dirty (private %ld, global %d), could not flush it",
- buf->sb_relname, block, buf->tag.blockNum,
- PrivateRefCount[i], buf->refcount);
- return -1;
- }
- SpinAcquire(BufMgrLock);
- }
- else
- {
- SpinRelease(BufMgrLock);
+ PinBuffer(buf);
+ SpinRelease(BufMgrLock);
+ if (! expectDirty)
elog(NOTICE, "FlushRelationBuffers(%s, %u): block %u is dirty (private %ld, global %d)",
- buf->sb_relname, block, buf->tag.blockNum,
+ RelationGetRelationName(rel), block,
+ buf->tag.blockNum,
+ PrivateRefCount[i], buf->refcount);
+ if (FlushBuffer(i+1, true) != STATUS_OK)
+ {
+ elog(NOTICE, "FlushRelationBuffers(%s, %u): block %u is dirty (private %ld, global %d), could not flush it",
+ RelationGetRelationName(rel), block,
+ buf->tag.blockNum,
PrivateRefCount[i], buf->refcount);
return -1;
}
+ SpinAcquire(BufMgrLock);
+ /* Buffer could already be reassigned, so must recheck
+ * whether it still belongs to rel before freeing it!
+ */
+ goto recheck;
}
if (!(buf->flags & BM_FREE))
{
SpinRelease(BufMgrLock);
elog(NOTICE, "FlushRelationBuffers(%s, %u): block %u is referenced (private %ld, global %d)",
- buf->sb_relname, block, buf->tag.blockNum,
+ RelationGetRelationName(rel), block,
+ buf->tag.blockNum,
PrivateRefCount[i], buf->refcount);
return -2;
}
@@ -1755,11 +1951,6 @@ ReleaseBuffer(Buffer buffer)
AddBufferToFreelist(bufHdr);
bufHdr->flags |= BM_FREE;
}
- if (CommitInfoNeedsSave[buffer - 1])
- {
- bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
- CommitInfoNeedsSave[buffer - 1] = 0;
- }
SpinRelease(BufMgrLock);
}
@@ -1777,7 +1968,7 @@ IncrBufferRefCount_Debug(char *file, int line, Buffer buffer)
fprintf(stderr, "PIN(Incr) %ld relname = %s, blockNum = %d, \
refcount = %ld, file: %s, line: %d\n",
- buffer, buf->sb_relname, buf->tag.blockNum,
+ buffer, buf->blind.relname, buf->tag.blockNum,
PrivateRefCount[buffer - 1], file, line);
}
}
@@ -1795,7 +1986,7 @@ ReleaseBuffer_Debug(char *file, int line, Buffer buffer)
fprintf(stderr, "UNPIN(Rel) %ld relname = %s, blockNum = %d, \
refcount = %ld, file: %s, line: %d\n",
- buffer, buf->sb_relname, buf->tag.blockNum,
+ buffer, buf->blind.relname, buf->tag.blockNum,
PrivateRefCount[buffer - 1], file, line);
}
}
@@ -1822,7 +2013,7 @@ ReleaseAndReadBuffer_Debug(char *file,
fprintf(stderr, "UNPIN(Rel&Rd) %ld relname = %s, blockNum = %d, \
refcount = %ld, file: %s, line: %d\n",
- buffer, buf->sb_relname, buf->tag.blockNum,
+ buffer, buf->blind.relname, buf->tag.blockNum,
PrivateRefCount[buffer - 1], file, line);
}
if (ShowPinTrace && BufferIsLocal(buffer) && is_userbuffer(buffer))
@@ -1831,7 +2022,7 @@ refcount = %ld, file: %s, line: %d\n",
fprintf(stderr, "PIN(Rel&Rd) %ld relname = %s, blockNum = %d, \
refcount = %ld, file: %s, line: %d\n",
- b, buf->sb_relname, buf->tag.blockNum,
+ b, buf->blind.relname, buf->tag.blockNum,
PrivateRefCount[b - 1], file, line);
}
return b;
@@ -1983,11 +2174,43 @@ _bm_die(Oid dbId, Oid relId, int blkNo, int bufNo,
#endif /* BMTRACE */
+/*
+ * SetBufferCommitInfoNeedsSave
+ *
+ * Mark a buffer dirty when we have updated tuple commit-status bits in it.
+ *
+ * This is similar to WriteNoReleaseBuffer, except that we do not set
+ * SharedBufferChanged or BufferDirtiedByMe, because we have not made a
+ * critical change that has to be flushed to disk before xact commit --- the
+ * status-bit update could be redone by someone else just as easily. The
+ * buffer will be marked dirty, but it will not be written to disk until
+ * there is another reason to write it.
+ *
+ * This routine might get called many times on the same page, if we are making
+ * the first scan after commit of an xact that added/deleted many tuples.
+ * So, be as quick as we can if the buffer is already dirty.
+ */
void
SetBufferCommitInfoNeedsSave(Buffer buffer)
{
- if (!BufferIsLocal(buffer))
- CommitInfoNeedsSave[buffer - 1]++;
+ BufferDesc *bufHdr;
+
+ if (BufferIsLocal(buffer))
+ return;
+
+ if (BAD_BUFFER_ID(buffer))
+ return;
+
+ bufHdr = &BufferDescriptors[buffer - 1];
+
+ if ((bufHdr->flags & (BM_DIRTY | BM_JUST_DIRTIED)) !=
+ (BM_DIRTY | BM_JUST_DIRTIED))
+ {
+ SpinAcquire(BufMgrLock);
+ Assert(bufHdr->refcount > 0);
+ bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
+ SpinRelease(BufMgrLock);
+ }
}
void
@@ -2175,7 +2398,16 @@ static void StartBufferIO(BufferDesc *buf, bool forInput)
Assert(!(buf->flags & BM_IO_IN_PROGRESS));
buf->flags |= BM_IO_IN_PROGRESS;
#ifdef HAS_TEST_AND_SET
- Assert(S_LOCK_FREE(&(buf->io_in_progress_lock)))
+ /*
+ * There used to be
+ *
+ * Assert(S_LOCK_FREE(&(buf->io_in_progress_lock)));
+ *
+ * here, but that's wrong because of the way WaitIO works: someone else
+ * waiting for the I/O to complete will succeed in grabbing the lock for
+ * a few instructions, and if we context-swap back to here the Assert
+ * could fail. Tiny window for failure, but I've seen it happen -- tgl
+ */
S_LOCK(&(buf->io_in_progress_lock));
#endif /* HAS_TEST_AND_SET */
InProgressBuf = buf;
@@ -2217,7 +2449,7 @@ static void ContinueBufferIO(BufferDesc *buf, bool forInput)
IsForInput = forInput;
}
-extern void InitBufferIO(void)
+void InitBufferIO(void)
{
InProgressBuf = (BufferDesc *)0;
}
@@ -2229,7 +2461,7 @@ extern void InitBufferIO(void)
* set in case of output,this routine would kill all
* backends and reset postmaster.
*/
-extern void AbortBufferIO(void)
+void AbortBufferIO(void)
{
BufferDesc *buf = InProgressBuf;
if (buf)
@@ -2252,8 +2484,8 @@ extern void AbortBufferIO(void)
buf->flags |= BM_DIRTY;
}
buf->flags |= BM_IO_ERROR;
- TerminateBufferIO(buf);
buf->flags &= ~BM_IO_IN_PROGRESS;
+ TerminateBufferIO(buf);
SpinRelease(BufMgrLock);
}
}