diff options
Diffstat (limited to 'src/backend/storage/buffer/bufmgr.c')
-rw-r--r-- | src/backend/storage/buffer/bufmgr.c | 606 |
1 files changed, 419 insertions, 187 deletions
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index acc719ca4b6..0887f3d1ecd 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v 1.77 2000/03/31 02:43:31 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v 1.78 2000/04/09 04:43:18 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -94,8 +94,10 @@ static Buffer ReadBufferWithBufferLock(Relation relation, BlockNumber blockNum, bool bufferLockHeld); static BufferDesc *BufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr, bool bufferLockHeld); +static void SetBufferDirtiedByMe(Buffer buffer, BufferDesc *bufHdr); +static void ClearBufferDirtiedByMe(Buffer buffer, BufferDesc *bufHdr); static void BufferSync(void); -static int BufferReplace(BufferDesc *bufHdr, bool bufferLockHeld); +static int BufferReplace(BufferDesc *bufHdr); void PrintBufferDescs(void); /* --------------------------------------------------- @@ -176,7 +178,7 @@ is_userbuffer(Buffer buffer) { BufferDesc *buf = &BufferDescriptors[buffer - 1]; - if (IsSystemRelationName(buf->sb_relname)) + if (IsSystemRelationName(buf->blind.relname)) return false; return true; } @@ -199,7 +201,7 @@ ReadBuffer_Debug(char *file, fprintf(stderr, "PIN(RD) %ld relname = %s, blockNum = %d, \ refcount = %ld, file: %s, line: %d\n", - buffer, buf->sb_relname, buf->tag.blockNum, + buffer, buf->blind.relname, buf->tag.blockNum, PrivateRefCount[buffer - 1], file, line); } return buffer; @@ -390,22 +392,21 @@ BufferAlloc(Relation reln, * If there's no IO for the buffer and the buffer * is BROKEN,it should be read again. So start a * new buffer IO here. - - * - * wierd race condition: - * - * We were waiting for someone else to read the buffer. While - * we were waiting, the reader boof'd in some way, so the - * contents of the buffer are still invalid. By saying - * that we didn't find it, we can make the caller - * reinitialize the buffer. If two processes are waiting - * for this block, both will read the block. The second - * one to finish may overwrite any updates made by the - * first. (Assume higher level synchronization prevents - * this from happening). - * - * This is never going to happen, don't worry about it. - */ + * + * wierd race condition: + * + * We were waiting for someone else to read the buffer. While + * we were waiting, the reader boof'd in some way, so the + * contents of the buffer are still invalid. By saying + * that we didn't find it, we can make the caller + * reinitialize the buffer. If two processes are waiting + * for this block, both will read the block. The second + * one to finish may overwrite any updates made by the + * first. (Assume higher level synchronization prevents + * this from happening). + * + * This is never going to happen, don't worry about it. + */ *foundPtr = FALSE; } #ifdef BMTRACE @@ -465,33 +466,24 @@ BufferAlloc(Relation reln, * in WaitIO until we're done. */ inProgress = TRUE; -#ifdef HAS_TEST_AND_SET /* * All code paths that acquire this lock pin the buffer first; * since no one had it pinned (it just came off the free * list), no one else can have this lock. */ -#endif /* HAS_TEST_AND_SET */ StartBufferIO(buf, false); /* * Write the buffer out, being careful to release BufMgrLock * before starting the I/O. - * - * This #ifndef is here because a few extra semops REALLY kill - * you on machines that don't have spinlocks. If you don't - * operate with much concurrency, well... */ - smok = BufferReplace(buf, true); -#ifndef OPTIMIZE_SINGLE - SpinAcquire(BufMgrLock); -#endif /* OPTIMIZE_SINGLE */ + smok = BufferReplace(buf); if (smok == FALSE) { elog(NOTICE, "BufferAlloc: cannot write block %u for %s/%s", - buf->tag.blockNum, buf->sb_dbname, buf->sb_relname); + buf->tag.blockNum, buf->blind.dbname, buf->blind.relname); inProgress = FALSE; buf->flags |= BM_IO_ERROR; buf->flags &= ~BM_IO_IN_PROGRESS; @@ -516,7 +508,7 @@ BufferAlloc(Relation reln, if (buf->flags & BM_JUST_DIRTIED) { elog(FATAL, "BufferAlloc: content of block %u (%s) changed while flushing", - buf->tag.blockNum, buf->sb_relname); + buf->tag.blockNum, buf->blind.relname); } else buf->flags &= ~BM_DIRTY; @@ -562,6 +554,7 @@ BufferAlloc(Relation reln, */ if (buf != NULL) { + buf->flags &= ~BM_IO_IN_PROGRESS; TerminateBufferIO(buf); /* give up the buffer since we don't need it any more */ PrivateRefCount[BufferDescriptorGetBuffer(buf) - 1] = 0; @@ -572,7 +565,6 @@ BufferAlloc(Relation reln, AddBufferToFreelist(buf); buf->flags |= BM_FREE; } - buf->flags &= ~BM_IO_IN_PROGRESS; } PinBuffer(buf2); @@ -619,8 +611,8 @@ BufferAlloc(Relation reln, } /* record the database name and relation name for this buffer */ - strcpy(buf->sb_relname, RelationGetPhysicalRelationName(reln)); - strcpy(buf->sb_dbname, DatabaseName); + strcpy(buf->blind.dbname, DatabaseName); + strcpy(buf->blind.relname, RelationGetPhysicalRelationName(reln)); INIT_BUFFERTAG(&(buf->tag), reln, blockNum); if (!BufTableInsert(buf)) @@ -683,9 +675,9 @@ WriteBuffer(Buffer buffer) SpinAcquire(BufMgrLock); Assert(bufHdr->refcount > 0); bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED); + SetBufferDirtiedByMe(buffer, bufHdr); UnpinBuffer(bufHdr); SpinRelease(BufMgrLock); - CommitInfoNeedsSave[buffer - 1] = 0; return TRUE; } @@ -702,7 +694,7 @@ WriteBuffer_Debug(char *file, int line, Buffer buffer) buf = &BufferDescriptors[buffer - 1]; fprintf(stderr, "UNPIN(WR) %ld relname = %s, blockNum = %d, \ refcount = %ld, file: %s, line: %d\n", - buffer, buf->sb_relname, buf->tag.blockNum, + buffer, buf->blind.relname, buf->tag.blockNum, PrivateRefCount[buffer - 1], file, line); } } @@ -767,8 +759,9 @@ DirtyBufferCopy(Oid dbid, Oid relid, BlockNumber blkno, char *dest) * * 'buffer' is known to be dirty/pinned, so there should not be a * problem reading the BufferDesc members without the BufMgrLock - * (nobody should be able to change tags, flags, etc. out from under - * us). Unpin if 'release' is TRUE. + * (nobody should be able to change tags out from under us). + * + * Unpin if 'release' is TRUE. */ int FlushBuffer(Buffer buffer, bool release) @@ -784,6 +777,8 @@ FlushBuffer(Buffer buffer, bool release) if (BAD_BUFFER_ID(buffer)) return STATUS_ERROR; + Assert(PrivateRefCount[buffer - 1] > 0); /* else caller didn't pin */ + bufHdr = &BufferDescriptors[buffer - 1]; bufdb = bufHdr->tag.relId.dbId; @@ -809,7 +804,7 @@ FlushBuffer(Buffer buffer, bool release) if (status == SM_FAIL) { elog(ERROR, "FlushBuffer: cannot flush block %u of the relation %s", - bufHdr->tag.blockNum, bufHdr->sb_relname); + bufHdr->tag.blockNum, bufHdr->blind.relname); return STATUS_ERROR; } BufferFlushCount++; @@ -820,19 +815,21 @@ FlushBuffer(Buffer buffer, bool release) /* * If this buffer was marked by someone as DIRTY while we were - * flushing it out we must not clear DIRTY flag - vadim 01/17/97 + * flushing it out we must not clear shared DIRTY flag - vadim 01/17/97 + * + * ... but we can clear BufferDirtiedByMe anyway - tgl 3/31/00 */ if (bufHdr->flags & BM_JUST_DIRTIED) { elog(NOTICE, "FlushBuffer: content of block %u (%s) changed while flushing", - bufHdr->tag.blockNum, bufHdr->sb_relname); + bufHdr->tag.blockNum, bufHdr->blind.relname); } else bufHdr->flags &= ~BM_DIRTY; + ClearBufferDirtiedByMe(buffer, bufHdr); if (release) UnpinBuffer(bufHdr); SpinRelease(BufMgrLock); - CommitInfoNeedsSave[buffer - 1] = 0; return STATUS_OK; } @@ -857,9 +854,10 @@ WriteNoReleaseBuffer(Buffer buffer) SharedBufferChanged = true; SpinAcquire(BufMgrLock); + Assert(bufHdr->refcount > 0); bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED); + SetBufferDirtiedByMe(buffer, bufHdr); SpinRelease(BufMgrLock); - CommitInfoNeedsSave[buffer - 1] = 0; return STATUS_OK; } @@ -901,11 +899,6 @@ ReleaseAndReadBuffer(Buffer buffer, AddBufferToFreelist(bufHdr); bufHdr->flags |= BM_FREE; } - if (CommitInfoNeedsSave[buffer - 1]) - { - bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED); - CommitInfoNeedsSave[buffer - 1] = 0; - } retbuf = ReadBufferWithBufferLock(relation, blockNum, true); return retbuf; } @@ -916,12 +909,119 @@ ReleaseAndReadBuffer(Buffer buffer, } /* + * SetBufferDirtiedByMe -- mark a shared buffer as being dirtied by this xact + * + * This flag essentially remembers that we need to write and fsync this buffer + * before we can commit the transaction. The write might end up getting done + * by another backend, but we must do the fsync ourselves (else we could + * commit before the data actually reaches disk). We do not issue fsync + * instantly upon write; the storage manager keeps track of which files need + * to be fsync'd before commit can occur. A key aspect of this data structure + * is that we will be able to notify the storage manager that an fsync is + * needed even after another backend has done the physical write and replaced + * the buffer contents with something else! + * + * NB: we must be holding the bufmgr lock at entry, and the buffer must be + * pinned so that no other backend can take it away from us. + */ +static void +SetBufferDirtiedByMe(Buffer buffer, BufferDesc *bufHdr) +{ + BufferTag *tagLastDirtied = & BufferTagLastDirtied[buffer - 1]; + Relation reln; + int status; + + /* + * If the flag is already set, check to see whether the buffertag is + * the same. If not, some other backend already wrote the buffer data + * that we dirtied. We must tell the storage manager to make an fsync + * pending on that file before we can overwrite the old tag value. + */ + if (BufferDirtiedByMe[buffer - 1]) + { + if (bufHdr->tag.relId.dbId == tagLastDirtied->relId.dbId && + bufHdr->tag.relId.relId == tagLastDirtied->relId.relId && + bufHdr->tag.blockNum == tagLastDirtied->blockNum) + return; /* Same tag already dirtied, so no work */ + +#ifndef OPTIMIZE_SINGLE + SpinRelease(BufMgrLock); +#endif /* OPTIMIZE_SINGLE */ + + reln = RelationIdCacheGetRelation(tagLastDirtied->relId.relId); + + if (reln == (Relation) NULL) + { + status = smgrblindmarkdirty(DEFAULT_SMGR, + BufferBlindLastDirtied[buffer - 1].dbname, + BufferBlindLastDirtied[buffer - 1].relname, + tagLastDirtied->relId.dbId, + tagLastDirtied->relId.relId, + tagLastDirtied->blockNum); + } + else + { + status = smgrmarkdirty(DEFAULT_SMGR, reln, + tagLastDirtied->blockNum); + /* drop relcache refcnt incremented by RelationIdCacheGetRelation */ + RelationDecrementReferenceCount(reln); + } + if (status == SM_FAIL) + { + elog(ERROR, "SetBufferDirtiedByMe: cannot mark %u for %s", + tagLastDirtied->blockNum, + BufferBlindLastDirtied[buffer - 1].relname); + } + +#ifndef OPTIMIZE_SINGLE + SpinAcquire(BufMgrLock); +#endif /* OPTIMIZE_SINGLE */ + + } + + *tagLastDirtied = bufHdr->tag; + BufferBlindLastDirtied[buffer - 1] = bufHdr->blind; + BufferDirtiedByMe[buffer - 1] = true; +} + +/* + * ClearBufferDirtiedByMe -- mark a shared buffer as no longer needing fsync + * + * If we write out a buffer ourselves, then the storage manager will set its + * needs-fsync flag for that file automatically, and so we can clear our own + * flag that says it needs to be done later. + * + * NB: we must be holding the bufmgr lock at entry. + */ +static void +ClearBufferDirtiedByMe(Buffer buffer, BufferDesc *bufHdr) +{ + BufferTag *tagLastDirtied = & BufferTagLastDirtied[buffer - 1]; + + /* + * Do *not* clear the flag if it refers to some other buffertag than + * the data we just wrote. This is unlikely, but possible if some + * other backend replaced the buffer contents since we set our flag. + */ + if (bufHdr->tag.relId.dbId == tagLastDirtied->relId.dbId && + bufHdr->tag.relId.relId == tagLastDirtied->relId.relId && + bufHdr->tag.blockNum == tagLastDirtied->blockNum) + { + BufferDirtiedByMe[buffer - 1] = false; + } +} + +/* * BufferSync -- Flush all dirty buffers in the pool. * - * This is called at transaction commit time. It does the wrong thing, - * right now. We should flush only our own changes to stable storage, - * and we should obey the lock protocol on the buffer manager metadata - * as we do it. Also, we need to be sure that no other transaction is + * This is called at transaction commit time. We find all buffers + * that have been dirtied by the current xact and flush them to disk. + * We do *not* flush dirty buffers that have been dirtied by other xacts. + * (This is a substantial change from pre-7.0 behavior.) + * + * OLD COMMENTS (do these still apply?) + * + * Also, we need to be sure that no other transaction is * modifying the page as we flush it. This is only a problem for objects * that use a non-two-phase locking protocol, like btree indices. For * those objects, we would like to set a write lock for the duration of @@ -936,21 +1036,49 @@ static void BufferSync() { int i; - Oid bufdb; - Oid bufrel; - Relation reln; BufferDesc *bufHdr; int status; + Relation reln; + bool didwrite; - SpinAcquire(BufMgrLock); for (i = 0, bufHdr = BufferDescriptors; i < NBuffers; i++, bufHdr++) { + /* Ignore buffers that were not dirtied by me */ + if (! BufferDirtiedByMe[i]) + continue; + + SpinAcquire(BufMgrLock); + + /* + * We only need to write if the buffer is still dirty and still + * contains the same disk page that it contained when we dirtied it. + * Otherwise, someone else has already written our changes for us, + * and we need only fsync. + * + * (NOTE: it's still possible to do an unnecessary write, if other + * xacts have written and then re-dirtied the page since our last + * change to it. But that should be pretty uncommon, and there's + * no easy way to detect it anyway.) + */ + reln = NULL; + didwrite = false; if ((bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY)) { + Oid bufdb; + Oid bufrel; + bufdb = bufHdr->tag.relId.dbId; bufrel = bufHdr->tag.relId.relId; - if (bufdb == MyDatabaseId || bufdb == (Oid) 0) + if (bufdb == BufferTagLastDirtied[i].relId.dbId && + bufrel == BufferTagLastDirtied[i].relId.relId && + bufHdr->tag.blockNum == BufferTagLastDirtied[i].blockNum) { + /* + * Try to find relation for buf. This could fail, if the + * rel has been flushed from the relcache since we dirtied + * the page. That should be uncommon, so paying the extra + * cost of a blind write when it happens seems OK. + */ reln = RelationIdCacheGetRelation(bufrel); /* @@ -970,74 +1098,114 @@ BufferSync() if (bufHdr->flags & BM_IO_ERROR) { elog(ERROR, "BufferSync: write error %u for %s", - bufHdr->tag.blockNum, bufHdr->sb_relname); + bufHdr->tag.blockNum, bufHdr->blind.relname); } - /* drop refcnt from RelationIdCacheGetRelation */ - if (reln != (Relation) NULL) - RelationDecrementReferenceCount(reln); - continue; - } - - /* - * To check if block content changed while flushing (see - * below). - vadim 01/17/97 - */ - WaitIO(bufHdr, BufMgrLock); /* confirm end of IO */ - bufHdr->flags &= ~BM_JUST_DIRTIED; - StartBufferIO(bufHdr, false); /* output IO start */ - - /* - * If we didn't have the reldesc in our local cache, flush - * this page out using the 'blind write' storage manager - * routine. If we did find it, use the standard - * interface. - */ - -#ifndef OPTIMIZE_SINGLE - SpinRelease(BufMgrLock); -#endif /* OPTIMIZE_SINGLE */ - if (reln == (Relation) NULL) - { - status = smgrblindwrt(DEFAULT_SMGR, bufHdr->sb_dbname, - bufHdr->sb_relname, bufdb, bufrel, - bufHdr->tag.blockNum, - (char *) MAKE_PTR(bufHdr->data)); } else { - status = smgrwrite(DEFAULT_SMGR, reln, - bufHdr->tag.blockNum, - (char *) MAKE_PTR(bufHdr->data)); - } + /* + * To check if block content changed while flushing (see + * below). - vadim 01/17/97 + */ + WaitIO(bufHdr, BufMgrLock); /* confirm end of IO */ + bufHdr->flags &= ~BM_JUST_DIRTIED; + StartBufferIO(bufHdr, false); /* output IO start */ + + /* + * If we didn't have the reldesc in our local cache, write + * this page out using the 'blind write' storage manager + * routine. If we did find it, use the standard + * interface. + */ #ifndef OPTIMIZE_SINGLE - SpinAcquire(BufMgrLock); + SpinRelease(BufMgrLock); +#endif /* OPTIMIZE_SINGLE */ + if (reln == (Relation) NULL) + { + status = smgrblindwrt(DEFAULT_SMGR, + bufHdr->blind.dbname, + bufHdr->blind.relname, + bufdb, bufrel, + bufHdr->tag.blockNum, + (char *) MAKE_PTR(bufHdr->data)); + } + else + { + status = smgrwrite(DEFAULT_SMGR, reln, + bufHdr->tag.blockNum, + (char *) MAKE_PTR(bufHdr->data)); + } +#ifndef OPTIMIZE_SINGLE + SpinAcquire(BufMgrLock); #endif /* OPTIMIZE_SINGLE */ - UnpinBuffer(bufHdr); - if (status == SM_FAIL) - { - bufHdr->flags |= BM_IO_ERROR; - elog(ERROR, "BufferSync: cannot write %u for %s", - bufHdr->tag.blockNum, bufHdr->sb_relname); + UnpinBuffer(bufHdr); + if (status == SM_FAIL) + { + bufHdr->flags |= BM_IO_ERROR; + elog(ERROR, "BufferSync: cannot write %u for %s", + bufHdr->tag.blockNum, bufHdr->blind.relname); + } + bufHdr->flags &= ~BM_IO_IN_PROGRESS; /* mark IO finished */ + TerminateBufferIO(bufHdr); /* Sync IO finished */ + BufferFlushCount++; + didwrite = true; + + /* + * If this buffer was marked by someone as DIRTY while we + * were flushing it out we must not clear DIRTY flag - + * vadim 01/17/97 + * + * but it is OK to clear BufferDirtiedByMe - tgl 3/31/00 + */ + if (!(bufHdr->flags & BM_JUST_DIRTIED)) + bufHdr->flags &= ~BM_DIRTY; } - bufHdr->flags &= ~BM_IO_IN_PROGRESS; /* mark IO finished */ - TerminateBufferIO(bufHdr); /* Sync IO finished */ - BufferFlushCount++; - /* - * If this buffer was marked by someone as DIRTY while we - * were flushing it out we must not clear DIRTY flag - - * vadim 01/17/97 - */ - if (!(bufHdr->flags & BM_JUST_DIRTIED)) - bufHdr->flags &= ~BM_DIRTY; - /* drop refcnt from RelationIdCacheGetRelation */ + /* drop refcnt obtained by RelationIdCacheGetRelation */ if (reln != (Relation) NULL) RelationDecrementReferenceCount(reln); } } + + /* + * If we did not write the buffer (because someone else did), + * we must still fsync the file containing it, to ensure that the + * write is down to disk before we commit. + */ + if (! didwrite) + { +#ifndef OPTIMIZE_SINGLE + SpinRelease(BufMgrLock); +#endif /* OPTIMIZE_SINGLE */ + + reln = RelationIdCacheGetRelation(BufferTagLastDirtied[i].relId.relId); + if (reln == (Relation) NULL) + { + status = smgrblindmarkdirty(DEFAULT_SMGR, + BufferBlindLastDirtied[i].dbname, + BufferBlindLastDirtied[i].relname, + BufferTagLastDirtied[i].relId.dbId, + BufferTagLastDirtied[i].relId.relId, + BufferTagLastDirtied[i].blockNum); + } + else + { + status = smgrmarkdirty(DEFAULT_SMGR, reln, + BufferTagLastDirtied[i].blockNum); + /* drop relcache refcnt incremented by RelationIdCacheGetRelation */ + RelationDecrementReferenceCount(reln); + + } +#ifndef OPTIMIZE_SINGLE + SpinAcquire(BufMgrLock); +#endif /* OPTIMIZE_SINGLE */ + } + + BufferDirtiedByMe[i] = false; + + SpinRelease(BufMgrLock); } - SpinRelease(BufMgrLock); LocalBufferSync(); } @@ -1166,13 +1334,19 @@ ResetBufferUsage() /* ---------------------------------------------- * ResetBufferPool * - * this routine is supposed to be called when a transaction aborts. + * This routine is supposed to be called when a transaction aborts. * it will release all the buffer pins held by the transaction. + * Currently, we also call it during commit if BufferPoolCheckLeak + * detected a problem --- in that case, isCommit is TRUE, and we + * only clean up buffer pin counts. + * + * During abort, we also forget any pending fsync requests. Dirtied buffers + * will still get written, eventually, but there will be no fsync for them. * * ---------------------------------------------- */ void -ResetBufferPool() +ResetBufferPool(bool isCommit) { int i; @@ -1193,10 +1367,15 @@ ResetBufferPool() SpinRelease(BufMgrLock); } PrivateRefCount[i] = 0; - CommitInfoNeedsSave[i] = 0; + + if (! isCommit) + BufferDirtiedByMe[i] = false; } ResetLocalBufferPool(); + + if (! isCommit) + smgrabort(); } /* ----------------------------------------------- @@ -1222,7 +1401,7 @@ BufferPoolCheckLeak() "Buffer Leak: [%03d] (freeNext=%ld, freePrev=%ld, \ relname=%s, blockNum=%d, flags=0x%x, refcount=%d %ld)", i - 1, buf->freeNext, buf->freePrev, - buf->sb_relname, buf->tag.blockNum, buf->flags, + buf->blind.relname, buf->tag.blockNum, buf->flags, buf->refcount, PrivateRefCount[i - 1]); result = 1; } @@ -1306,25 +1485,25 @@ BufferGetRelation(Buffer buffer) /* * BufferReplace * - * Flush the buffer corresponding to 'bufHdr' + * Write out the buffer corresponding to 'bufHdr' * + * This routine used to flush the data to disk (ie, force immediate fsync) + * but that's no longer necessary because BufferSync is smarter than before. + * + * BufMgrLock must be held at entry, and the buffer must be pinned. */ static int -BufferReplace(BufferDesc *bufHdr, bool bufferLockHeld) +BufferReplace(BufferDesc *bufHdr) { Relation reln; Oid bufdb, bufrel; int status; - if (!bufferLockHeld) - SpinAcquire(BufMgrLock); - /* * first try to find the reldesc in the cache, if no luck, don't * bother to build the reldesc from scratch, just do a blind write. */ - bufdb = bufHdr->tag.relId.dbId; bufrel = bufHdr->tag.relId.relId; @@ -1336,22 +1515,27 @@ BufferReplace(BufferDesc *bufHdr, bool bufferLockHeld) /* To check if block content changed while flushing. - vadim 01/17/97 */ bufHdr->flags &= ~BM_JUST_DIRTIED; +#ifndef OPTIMIZE_SINGLE SpinRelease(BufMgrLock); +#endif /* OPTIMIZE_SINGLE */ if (reln != (Relation) NULL) { - status = smgrflush(DEFAULT_SMGR, reln, bufHdr->tag.blockNum, + status = smgrwrite(DEFAULT_SMGR, reln, bufHdr->tag.blockNum, (char *) MAKE_PTR(bufHdr->data)); } else { - /* blind write always flushes */ - status = smgrblindwrt(DEFAULT_SMGR, bufHdr->sb_dbname, - bufHdr->sb_relname, bufdb, bufrel, + status = smgrblindwrt(DEFAULT_SMGR, bufHdr->blind.dbname, + bufHdr->blind.relname, bufdb, bufrel, bufHdr->tag.blockNum, (char *) MAKE_PTR(bufHdr->data)); } +#ifndef OPTIMIZE_SINGLE + SpinAcquire(BufMgrLock); +#endif /* OPTIMIZE_SINGLE */ + /* drop relcache refcnt incremented by RelationIdCacheGetRelation */ if (reln != (Relation) NULL) RelationDecrementReferenceCount(reln); @@ -1359,6 +1543,11 @@ BufferReplace(BufferDesc *bufHdr, bool bufferLockHeld) if (status == SM_FAIL) return FALSE; + /* If we had marked this buffer as needing to be fsync'd, we can forget + * about that, because it's now the storage manager's responsibility. + */ + ClearBufferDirtiedByMe(BufferDescriptorGetBuffer(bufHdr), bufHdr); + BufferFlushCount++; return TRUE; @@ -1440,7 +1629,7 @@ ReleaseRelationBuffers(Relation rel) } /* Now we can do what we came for */ buf->flags &= ~ ( BM_DIRTY | BM_JUST_DIRTIED); - CommitInfoNeedsSave[i - 1] = 0; + ClearBufferDirtiedByMe(i, buf); /* * Release any refcount we may have. * @@ -1502,6 +1691,7 @@ DropBuffers(Oid dbid) } /* Now we can do what we came for */ buf->flags &= ~ ( BM_DIRTY | BM_JUST_DIRTIED); + ClearBufferDirtiedByMe(i, buf); /* * The thing should be free, if caller has checked that * no backends are running in that database. @@ -1533,7 +1723,7 @@ PrintBufferDescs() elog(DEBUG, "[%02d] (freeNext=%ld, freePrev=%ld, relname=%s, \ blockNum=%d, flags=0x%x, refcount=%d %ld)", i, buf->freeNext, buf->freePrev, - buf->sb_relname, buf->tag.blockNum, buf->flags, + buf->blind.relname, buf->tag.blockNum, buf->flags, buf->refcount, PrivateRefCount[i]); } SpinRelease(BufMgrLock); @@ -1544,7 +1734,7 @@ blockNum=%d, flags=0x%x, refcount=%d %ld)", for (i = 0; i < NBuffers; ++i, ++buf) { printf("[%-2d] (%s, %d) flags=0x%x, refcnt=%d %ld)\n", - i, buf->sb_relname, buf->tag.blockNum, + i, buf->blind.relname, buf->tag.blockNum, buf->flags, buf->refcount, PrivateRefCount[i]); } } @@ -1562,7 +1752,7 @@ PrintPinnedBufs() if (PrivateRefCount[i] > 0) elog(NOTICE, "[%02d] (freeNext=%ld, freePrev=%ld, relname=%s, \ blockNum=%d, flags=0x%x, refcount=%d %ld)\n", - i, buf->freeNext, buf->freePrev, buf->sb_relname, + i, buf->freeNext, buf->freePrev, buf->blind.relname, buf->tag.blockNum, buf->flags, buf->refcount, PrivateRefCount[i]); } @@ -1601,33 +1791,42 @@ BufferPoolBlowaway() * FlushRelationBuffers * * This function removes from the buffer pool all pages of a relation - * that have blocknumber >= specified block. If doFlush is true, - * dirty buffers are written out --- otherwise it's an error for any - * of the buffers to be dirty. + * that have blocknumber >= specified block. Pages that are dirty are + * written out first. If expectDirty is false, a notice is emitted + * warning of dirty buffers, but we proceed anyway. An error code is + * returned if we fail to dump a dirty buffer or if we find one of + * the target pages is pinned into the cache. * * This is used by VACUUM before truncating the relation to the given - * number of blocks. For VACUUM, we pass doFlush = false since it would - * mean a bug in VACUUM if any of the unwanted pages were still dirty. - * (TRUNCATE TABLE also uses it in the same way.) + * number of blocks. For VACUUM, we pass expectDirty = false since it + * could mean a bug in VACUUM if any of the unwanted pages were still + * dirty. (TRUNCATE TABLE also uses it in the same way.) * - * This is also used by RENAME TABLE (with block = 0 and doFlush = true) + * This is also used by RENAME TABLE (with block=0 and expectDirty=true) * to clear out the buffer cache before renaming the physical files of * a relation. Without that, some other backend might try to do a - * blind write of a buffer page (relying on the sb_relname of the buffer) + * blind write of a buffer page (relying on the BlindId of the buffer) * and fail because it's not got the right filename anymore. * * In both cases, the caller should be holding AccessExclusiveLock on * the target relation to ensure that no other backend is busy reading - * more blocks of the relation... + * more blocks of the relation. + * + * Formerly, we considered it an error condition if we found unexpectedly + * dirty buffers. However, since BufferSync no longer forces out all + * dirty buffers at every xact commit, it's possible for dirty buffers + * to still be present in the cache due to failure of an earlier + * transaction. So, downgrade the error to a mere notice. Maybe we + * shouldn't even emit a notice... * - * Returns: 0 - Ok, -1 - DIRTY, -2 - PINNED + * Returns: 0 - Ok, -1 - FAILED TO WRITE DIRTY BUFFER, -2 - PINNED * * XXX currently it sequentially searches the buffer pool, should be * changed to more clever ways of searching. * -------------------------------------------------------------------- */ int -FlushRelationBuffers(Relation rel, BlockNumber block, bool doFlush) +FlushRelationBuffers(Relation rel, BlockNumber block, bool expectDirty) { int i; BufferDesc *buf; @@ -1642,21 +1841,15 @@ FlushRelationBuffers(Relation rel, BlockNumber block, bool doFlush) { if (buf->flags & BM_DIRTY) { - if (doFlush) - { - if (FlushBuffer(-i-1, false) != STATUS_OK) - { - elog(NOTICE, "FlushRelationBuffers(%s (local), %u): block %u is dirty, could not flush it", - RelationGetRelationName(rel), - block, buf->tag.blockNum); - return -1; - } - } - else - { + if (! expectDirty) elog(NOTICE, "FlushRelationBuffers(%s (local), %u): block %u is dirty", RelationGetRelationName(rel), block, buf->tag.blockNum); + if (FlushBuffer(-i-1, false) != STATUS_OK) + { + elog(NOTICE, "FlushRelationBuffers(%s (local), %u): block %u is dirty, could not flush it", + RelationGetRelationName(rel), + block, buf->tag.blockNum); return -1; } } @@ -1676,39 +1869,42 @@ FlushRelationBuffers(Relation rel, BlockNumber block, bool doFlush) SpinAcquire(BufMgrLock); for (i = 0; i < NBuffers; i++) { + recheck: buf = &BufferDescriptors[i]; - if (buf->tag.relId.dbId == MyDatabaseId && - buf->tag.relId.relId == RelationGetRelid(rel) && + if (buf->tag.relId.relId == RelationGetRelid(rel) && + (buf->tag.relId.dbId == MyDatabaseId || + buf->tag.relId.dbId == (Oid) NULL) && buf->tag.blockNum >= block) { if (buf->flags & BM_DIRTY) { - if (doFlush) - { - SpinRelease(BufMgrLock); - if (FlushBuffer(i+1, false) != STATUS_OK) - { - elog(NOTICE, "FlushRelationBuffers(%s, %u): block %u is dirty (private %ld, global %d), could not flush it", - buf->sb_relname, block, buf->tag.blockNum, - PrivateRefCount[i], buf->refcount); - return -1; - } - SpinAcquire(BufMgrLock); - } - else - { - SpinRelease(BufMgrLock); + PinBuffer(buf); + SpinRelease(BufMgrLock); + if (! expectDirty) elog(NOTICE, "FlushRelationBuffers(%s, %u): block %u is dirty (private %ld, global %d)", - buf->sb_relname, block, buf->tag.blockNum, + RelationGetRelationName(rel), block, + buf->tag.blockNum, + PrivateRefCount[i], buf->refcount); + if (FlushBuffer(i+1, true) != STATUS_OK) + { + elog(NOTICE, "FlushRelationBuffers(%s, %u): block %u is dirty (private %ld, global %d), could not flush it", + RelationGetRelationName(rel), block, + buf->tag.blockNum, PrivateRefCount[i], buf->refcount); return -1; } + SpinAcquire(BufMgrLock); + /* Buffer could already be reassigned, so must recheck + * whether it still belongs to rel before freeing it! + */ + goto recheck; } if (!(buf->flags & BM_FREE)) { SpinRelease(BufMgrLock); elog(NOTICE, "FlushRelationBuffers(%s, %u): block %u is referenced (private %ld, global %d)", - buf->sb_relname, block, buf->tag.blockNum, + RelationGetRelationName(rel), block, + buf->tag.blockNum, PrivateRefCount[i], buf->refcount); return -2; } @@ -1755,11 +1951,6 @@ ReleaseBuffer(Buffer buffer) AddBufferToFreelist(bufHdr); bufHdr->flags |= BM_FREE; } - if (CommitInfoNeedsSave[buffer - 1]) - { - bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED); - CommitInfoNeedsSave[buffer - 1] = 0; - } SpinRelease(BufMgrLock); } @@ -1777,7 +1968,7 @@ IncrBufferRefCount_Debug(char *file, int line, Buffer buffer) fprintf(stderr, "PIN(Incr) %ld relname = %s, blockNum = %d, \ refcount = %ld, file: %s, line: %d\n", - buffer, buf->sb_relname, buf->tag.blockNum, + buffer, buf->blind.relname, buf->tag.blockNum, PrivateRefCount[buffer - 1], file, line); } } @@ -1795,7 +1986,7 @@ ReleaseBuffer_Debug(char *file, int line, Buffer buffer) fprintf(stderr, "UNPIN(Rel) %ld relname = %s, blockNum = %d, \ refcount = %ld, file: %s, line: %d\n", - buffer, buf->sb_relname, buf->tag.blockNum, + buffer, buf->blind.relname, buf->tag.blockNum, PrivateRefCount[buffer - 1], file, line); } } @@ -1822,7 +2013,7 @@ ReleaseAndReadBuffer_Debug(char *file, fprintf(stderr, "UNPIN(Rel&Rd) %ld relname = %s, blockNum = %d, \ refcount = %ld, file: %s, line: %d\n", - buffer, buf->sb_relname, buf->tag.blockNum, + buffer, buf->blind.relname, buf->tag.blockNum, PrivateRefCount[buffer - 1], file, line); } if (ShowPinTrace && BufferIsLocal(buffer) && is_userbuffer(buffer)) @@ -1831,7 +2022,7 @@ refcount = %ld, file: %s, line: %d\n", fprintf(stderr, "PIN(Rel&Rd) %ld relname = %s, blockNum = %d, \ refcount = %ld, file: %s, line: %d\n", - b, buf->sb_relname, buf->tag.blockNum, + b, buf->blind.relname, buf->tag.blockNum, PrivateRefCount[b - 1], file, line); } return b; @@ -1983,11 +2174,43 @@ _bm_die(Oid dbId, Oid relId, int blkNo, int bufNo, #endif /* BMTRACE */ +/* + * SetBufferCommitInfoNeedsSave + * + * Mark a buffer dirty when we have updated tuple commit-status bits in it. + * + * This is similar to WriteNoReleaseBuffer, except that we do not set + * SharedBufferChanged or BufferDirtiedByMe, because we have not made a + * critical change that has to be flushed to disk before xact commit --- the + * status-bit update could be redone by someone else just as easily. The + * buffer will be marked dirty, but it will not be written to disk until + * there is another reason to write it. + * + * This routine might get called many times on the same page, if we are making + * the first scan after commit of an xact that added/deleted many tuples. + * So, be as quick as we can if the buffer is already dirty. + */ void SetBufferCommitInfoNeedsSave(Buffer buffer) { - if (!BufferIsLocal(buffer)) - CommitInfoNeedsSave[buffer - 1]++; + BufferDesc *bufHdr; + + if (BufferIsLocal(buffer)) + return; + + if (BAD_BUFFER_ID(buffer)) + return; + + bufHdr = &BufferDescriptors[buffer - 1]; + + if ((bufHdr->flags & (BM_DIRTY | BM_JUST_DIRTIED)) != + (BM_DIRTY | BM_JUST_DIRTIED)) + { + SpinAcquire(BufMgrLock); + Assert(bufHdr->refcount > 0); + bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED); + SpinRelease(BufMgrLock); + } } void @@ -2175,7 +2398,16 @@ static void StartBufferIO(BufferDesc *buf, bool forInput) Assert(!(buf->flags & BM_IO_IN_PROGRESS)); buf->flags |= BM_IO_IN_PROGRESS; #ifdef HAS_TEST_AND_SET - Assert(S_LOCK_FREE(&(buf->io_in_progress_lock))) + /* + * There used to be + * + * Assert(S_LOCK_FREE(&(buf->io_in_progress_lock))); + * + * here, but that's wrong because of the way WaitIO works: someone else + * waiting for the I/O to complete will succeed in grabbing the lock for + * a few instructions, and if we context-swap back to here the Assert + * could fail. Tiny window for failure, but I've seen it happen -- tgl + */ S_LOCK(&(buf->io_in_progress_lock)); #endif /* HAS_TEST_AND_SET */ InProgressBuf = buf; @@ -2217,7 +2449,7 @@ static void ContinueBufferIO(BufferDesc *buf, bool forInput) IsForInput = forInput; } -extern void InitBufferIO(void) +void InitBufferIO(void) { InProgressBuf = (BufferDesc *)0; } @@ -2229,7 +2461,7 @@ extern void InitBufferIO(void) * set in case of output,this routine would kill all * backends and reset postmaster. */ -extern void AbortBufferIO(void) +void AbortBufferIO(void) { BufferDesc *buf = InProgressBuf; if (buf) @@ -2252,8 +2484,8 @@ extern void AbortBufferIO(void) buf->flags |= BM_DIRTY; } buf->flags |= BM_IO_ERROR; - TerminateBufferIO(buf); buf->flags &= ~BM_IO_IN_PROGRESS; + TerminateBufferIO(buf); SpinRelease(BufMgrLock); } } |