Diffstat (limited to 'src/backend/storage/buffer/xlog_bufmgr.c')
-rw-r--r-- | src/backend/storage/buffer/xlog_bufmgr.c | 2205
1 file changed, 2205 insertions, 0 deletions
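The file added below exposes the pin/lock/dirty protocol summarized in its header comment. As a reader's aid (not part of the commit), here is a minimal caller-side sketch of that protocol; ModifyPage() is a hypothetical placeholder for real page edits, and the includes assume the 7.0-era tree.

/*
 * Reader's sketch (not part of the commit): driving the bufmgr interface
 * described in the file-header comment below.  ModifyPage() is a
 * hypothetical placeholder for actual page edits.
 */
#include "postgres.h"
#include "storage/bufmgr.h"
#include "storage/bufpage.h"

static void
TouchOnePage(Relation rel, BlockNumber blkno)
{
	Buffer		buf = ReadBuffer(rel, blkno);	/* pins; reads on cache miss */

	if (!BufferIsValid(buf))
		elog(ERROR, "TouchOnePage: cannot read block %u", blkno);

	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);		/* per-buffer content lock */
	ModifyPage(BufferGetPage(buf));			/* hypothetical page edit */
	LockBuffer(buf, BUFFER_LOCK_UNLOCK);

	WriteBuffer(buf);	/* mark dirty and unpin; the disk write is deferred */
}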
diff --git a/src/backend/storage/buffer/xlog_bufmgr.c b/src/backend/storage/buffer/xlog_bufmgr.c
new file mode 100644
index 00000000000..dcd377b7eb3
--- /dev/null
+++ b/src/backend/storage/buffer/xlog_bufmgr.c
@@ -0,0 +1,2205 @@
+/*-------------------------------------------------------------------------
+ *
+ * bufmgr.c
+ *	  buffer manager interface routines
+ *
+ * Portions Copyright (c) 1996-2000, PostgreSQL, Inc
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  $Header: /cvsroot/pgsql/src/backend/storage/buffer/Attic/xlog_bufmgr.c,v 1.1 2000/10/28 16:20:56 vadim Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+/*
+ *
+ * BufferAlloc() -- lookup a buffer in the buffer table.  If
+ *		it isn't there add it, but do not read data into memory.
+ *		This is used when we are about to reinitialize the
+ *		buffer so we don't care what the current disk contents are.
+ *		BufferAlloc() also pins the new buffer in memory.
+ *
+ * ReadBuffer() -- like BufferAlloc() but reads the data
+ *		on a buffer cache miss.
+ *
+ * ReleaseBuffer() -- unpin the buffer
+ *
+ * WriteNoReleaseBuffer() -- mark the buffer contents as "dirty"
+ *		but don't unpin.  The disk IO is delayed until buffer
+ *		replacement.
+ *
+ * WriteBuffer() -- WriteNoReleaseBuffer() + ReleaseBuffer()
+ *
+ * BufferSync() -- flush all dirty buffers in the buffer pool.
+ *
+ * InitBufferPool() -- Init the buffer module.
+ *
+ * See other files:
+ *		freelist.c -- chooses victim for buffer replacement
+ *		buf_table.c -- manages the buffer lookup table
+ */
+#include <sys/types.h>
+#include <sys/file.h>
+#include <math.h>
+#include <signal.h>
+
+#include "postgres.h"
+#include "executor/execdebug.h"
+#include "miscadmin.h"
+#include "storage/s_lock.h"
+#include "storage/smgr.h"
+#include "utils/relcache.h"
+
+#ifdef XLOG
+#include "catalog/pg_database.h"
+#endif
+
+#define BufferGetLSN(bufHdr) \
+	(*((XLogRecPtr*)MAKE_PTR((bufHdr)->data)))
+
+
+extern SPINLOCK BufMgrLock;
+extern long int ReadBufferCount;
+extern long int ReadLocalBufferCount;
+extern long int BufferHitCount;
+extern long int LocalBufferHitCount;
+extern long int BufferFlushCount;
+extern long int LocalBufferFlushCount;
+
+/*
+ * This flag is used to avoid disk writes for read-only transactions
+ * (i.e. when no shared buffer was changed by the transaction).
+ * We set it to true in WriteBuffer/WriteNoReleaseBuffer when
+ * marking a shared buffer as dirty.  We set it to false in xact.c
+ * after the transaction is committed/aborted.
+ */ +bool SharedBufferChanged = false; + +static void WaitIO(BufferDesc *buf, SPINLOCK spinlock); +static void StartBufferIO(BufferDesc *buf, bool forInput); +static void TerminateBufferIO(BufferDesc *buf); +static void ContinueBufferIO(BufferDesc *buf, bool forInput); +extern void AbortBufferIO(void); + +/* + * Macro : BUFFER_IS_BROKEN + * Note that write error doesn't mean the buffer broken +*/ +#define BUFFER_IS_BROKEN(buf) ((buf->flags & BM_IO_ERROR) && !(buf->flags & BM_DIRTY)) + +#ifndef HAS_TEST_AND_SET +static void SignalIO(BufferDesc *buf); +extern long *NWaitIOBackendP; /* defined in buf_init.c */ + +#endif /* HAS_TEST_AND_SET */ + +static Buffer ReadBufferWithBufferLock(Relation relation, BlockNumber blockNum, + bool bufferLockHeld); +static BufferDesc *BufferAlloc(Relation reln, BlockNumber blockNum, + bool *foundPtr, bool bufferLockHeld); +static int BufferReplace(BufferDesc *bufHdr); +void PrintBufferDescs(void); + +/* --------------------------------------------------- + * RelationGetBufferWithBuffer + * see if the given buffer is what we want + * if yes, we don't need to bother the buffer manager + * --------------------------------------------------- + */ +Buffer +RelationGetBufferWithBuffer(Relation relation, + BlockNumber blockNumber, + Buffer buffer) +{ + BufferDesc *bufHdr; + + if (BufferIsValid(buffer)) + { + if (!BufferIsLocal(buffer)) + { + bufHdr = &BufferDescriptors[buffer - 1]; + SpinAcquire(BufMgrLock); + if (bufHdr->tag.blockNum == blockNumber && + RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node)) + { + SpinRelease(BufMgrLock); + return buffer; + } + return ReadBufferWithBufferLock(relation, blockNumber, true); + } + else + { + bufHdr = &LocalBufferDescriptors[-buffer - 1]; + if (bufHdr->tag.blockNum == blockNumber && + RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node)) + return buffer; + } + } + return ReadBuffer(relation, blockNumber); +} + +/* + * ReadBuffer -- returns a buffer containing the requested + * block of the requested relation. If the blknum + * requested is P_NEW, extend the relation file and + * allocate a new block. + * + * Returns: the buffer number for the buffer containing + * the block read or NULL on an error. + * + * Assume when this function is called, that reln has been + * opened already. + */ + +#undef ReadBuffer /* conflicts with macro when BUFMGR_DEBUG + * defined */ + +/* + * ReadBuffer + * + */ +Buffer +ReadBuffer(Relation reln, BlockNumber blockNum) +{ + return ReadBufferWithBufferLock(reln, blockNum, false); +} + +/* + * ReadBufferWithBufferLock -- does the work of + * ReadBuffer() but with the possibility that + * the buffer lock has already been held. this + * is yet another effort to reduce the number of + * semops in the system. + */ +static Buffer +ReadBufferWithBufferLock(Relation reln, + BlockNumber blockNum, + bool bufferLockHeld) +{ + BufferDesc *bufHdr; + int extend; /* extending the file by one block */ + int status; + bool found; + bool isLocalBuf; + + extend = (blockNum == P_NEW); + isLocalBuf = reln->rd_myxactonly; + + if (isLocalBuf) + { + ReadLocalBufferCount++; + bufHdr = LocalBufferAlloc(reln, blockNum, &found); + if (found) + LocalBufferHitCount++; + } + else + { + ReadBufferCount++; + + /* + * lookup the buffer. IO_IN_PROGRESS is set if the requested + * block is not currently in memory. 
+ */ + bufHdr = BufferAlloc(reln, blockNum, &found, bufferLockHeld); + if (found) + BufferHitCount++; + } + + if (!bufHdr) + return InvalidBuffer; + + /* if it's already in the buffer pool, we're done */ + if (found) + { + + /* + * This happens when a bogus buffer was returned previously and is + * floating around in the buffer pool. A routine calling this + * would want this extended. + */ + if (extend) + { + /* new buffers are zero-filled */ + MemSet((char *) MAKE_PTR(bufHdr->data), 0, BLCKSZ); + smgrextend(DEFAULT_SMGR, reln, + (char *) MAKE_PTR(bufHdr->data)); + } + return BufferDescriptorGetBuffer(bufHdr); + + } + + /* + * if we have gotten to this point, the reln pointer must be ok and + * the relation file must be open. + */ + if (extend) + { + /* new buffers are zero-filled */ + MemSet((char *) MAKE_PTR(bufHdr->data), 0, BLCKSZ); + status = smgrextend(DEFAULT_SMGR, reln, + (char *) MAKE_PTR(bufHdr->data)); + } + else + { + status = smgrread(DEFAULT_SMGR, reln, blockNum, + (char *) MAKE_PTR(bufHdr->data)); + } + + if (isLocalBuf) + return BufferDescriptorGetBuffer(bufHdr); + + /* lock buffer manager again to update IO IN PROGRESS */ + SpinAcquire(BufMgrLock); + + if (status == SM_FAIL) + { + /* IO Failed. cleanup the data structures and go home */ + + if (!BufTableDelete(bufHdr)) + { + SpinRelease(BufMgrLock); + elog(FATAL, "BufRead: buffer table broken after IO error\n"); + } + /* remember that BufferAlloc() pinned the buffer */ + UnpinBuffer(bufHdr); + + /* + * Have to reset the flag so that anyone waiting for the buffer + * can tell that the contents are invalid. + */ + bufHdr->flags |= BM_IO_ERROR; + bufHdr->flags &= ~BM_IO_IN_PROGRESS; + } + else + { + /* IO Succeeded. clear the flags, finish buffer update */ + + bufHdr->flags &= ~(BM_IO_ERROR | BM_IO_IN_PROGRESS); + } + + /* If anyone was waiting for IO to complete, wake them up now */ + TerminateBufferIO(bufHdr); + + SpinRelease(BufMgrLock); + + if (status == SM_FAIL) + return InvalidBuffer; + + return BufferDescriptorGetBuffer(bufHdr); +} + +/* + * BufferAlloc -- Get a buffer from the buffer pool but dont + * read it. + * + * Returns: descriptor for buffer + * + * When this routine returns, the BufMgrLock is guaranteed NOT be held. + */ +static BufferDesc * +BufferAlloc(Relation reln, + BlockNumber blockNum, + bool *foundPtr, + bool bufferLockHeld) +{ + BufferDesc *buf, + *buf2; + BufferTag newTag; /* identity of requested block */ + bool inProgress; /* buffer undergoing IO */ + bool newblock = FALSE; + + /* create a new tag so we can lookup the buffer */ + /* assume that the relation is already open */ + if (blockNum == P_NEW) + { + newblock = TRUE; + blockNum = smgrnblocks(DEFAULT_SMGR, reln); + } + + INIT_BUFFERTAG(&newTag, reln, blockNum); + + if (!bufferLockHeld) + SpinAcquire(BufMgrLock); + + /* see if the block is in the buffer pool already */ + buf = BufTableLookup(&newTag); + if (buf != NULL) + { + + /* + * Found it. Now, (a) pin the buffer so no one steals it from the + * buffer pool, (b) check IO_IN_PROGRESS, someone may be faulting + * the buffer into the buffer pool. + */ + + PinBuffer(buf); + inProgress = (buf->flags & BM_IO_IN_PROGRESS); + + *foundPtr = TRUE; + if (inProgress) /* confirm end of IO */ + { + WaitIO(buf, BufMgrLock); + inProgress = (buf->flags & BM_IO_IN_PROGRESS); + } + if (BUFFER_IS_BROKEN(buf)) + { + + /* + * I couldn't understand the following old comment. If there's + * no IO for the buffer and the buffer is BROKEN,it should be + * read again. So start a new buffer IO here. 
+	 *
+	 * weird race condition:
+	 *
+	 * We were waiting for someone else to read the buffer. While we
+	 * were waiting, the reader failed in some way, so the
+	 * contents of the buffer are still invalid. By saying that
+	 * we didn't find it, we can make the caller reinitialize the
+	 * buffer. If two processes are waiting for this block, both
+	 * will read the block. The second one to finish may
+	 * overwrite any updates made by the first. (Assume higher
+	 * level synchronization prevents this from happening).
+	 *
+	 * This is never going to happen, don't worry about it.
+	 */
+		*foundPtr = FALSE;
+	}
+#ifdef BMTRACE
+	_bm_trace((reln->rd_rel->relisshared ? 0 : MyDatabaseId), RelationGetRelid(reln), blockNum, BufferDescriptorGetBuffer(buf), BMT_ALLOCFND);
+#endif	/* BMTRACE */
+
+	if (!(*foundPtr))
+		StartBufferIO(buf, true);
+	SpinRelease(BufMgrLock);
+
+	return buf;
+	}
+
+	*foundPtr = FALSE;
+
+	/*
+	 * Didn't find it in the buffer pool. We'll have to initialize a new
+	 * buffer. First, grab one from the free list. If it's dirty, flush
+	 * it to disk. Remember to unlock BufMgr spinlock while doing the IOs.
+	 */
+	inProgress = FALSE;
+	for (buf = (BufferDesc *) NULL; buf == (BufferDesc *) NULL;)
+	{
+		buf = GetFreeBuffer();
+
+		/* GetFreeBuffer will abort if it can't find a free buffer */
+		Assert(buf);
+
+		/*
+		 * There should be exactly one pin on the buffer after it is
+		 * allocated -- ours. If it had a pin it wouldn't have been on
+		 * the free list. No one else could have pinned it between
+		 * GetFreeBuffer and here because we have the BufMgrLock.
+		 */
+		Assert(buf->refcount == 0);
+		buf->refcount = 1;
+		PrivateRefCount[BufferDescriptorGetBuffer(buf) - 1] = 1;
+
+		if (buf->flags & BM_DIRTY || buf->cntxDirty)
+		{
+			bool		smok;
+
+			/*
+			 * skip write error buffers
+			 */
+			if ((buf->flags & BM_IO_ERROR) != 0)
+			{
+				PrivateRefCount[BufferDescriptorGetBuffer(buf) - 1] = 0;
+				buf->refcount--;
+				buf = (BufferDesc *) NULL;
+				continue;
+			}
+			/*
+			 * Set BM_IO_IN_PROGRESS to keep anyone from doing anything
+			 * with the contents of the buffer while we write it out. We
+			 * don't really care if they try to read it, but if they can
+			 * complete a BufferAlloc on it they can then scribble into
+			 * it, and we'd really like to avoid that while we are
+			 * flushing the buffer. Setting this flag should block them
+			 * in WaitIO until we're done.
+			 */
+			inProgress = TRUE;
+
+			/*
+			 * All code paths that acquire this lock pin the buffer first;
+			 * since no one had it pinned (it just came off the free
+			 * list), no one else can have this lock.
+			 */
+			StartBufferIO(buf, false);
+
+			/*
+			 * Write the buffer out, being careful to release BufMgrLock
+			 * before starting the I/O.
+			 */
+			smok = BufferReplace(buf);
+
+			if (smok == FALSE)
+			{
+				elog(NOTICE, "BufferAlloc: cannot write block %u for %s/%s",
+					 buf->tag.blockNum, buf->blind.dbname, buf->blind.relname);
+				inProgress = FALSE;
+				buf->flags |= BM_IO_ERROR;
+				buf->flags &= ~BM_IO_IN_PROGRESS;
+				TerminateBufferIO(buf);
+				PrivateRefCount[BufferDescriptorGetBuffer(buf) - 1] = 0;
+				Assert(buf->refcount > 0);
+				buf->refcount--;
+				if (buf->refcount == 0)
+				{
+					AddBufferToFreelist(buf);
+					buf->flags |= BM_FREE;
+				}
+				buf = (BufferDesc *) NULL;
+			}
+			else
+			{
+				/*
+				 * BM_JUST_DIRTIED cleared by BufferReplace and shouldn't
+				 * be set by anyone.
- vadim 01/17/97 + */ + if (buf->flags & BM_JUST_DIRTIED) + { + elog(STOP, "BufferAlloc: content of block %u (%s) changed while flushing", + buf->tag.blockNum, buf->blind.relname); + } + else + buf->flags &= ~BM_DIRTY; + buf->cntxDirty = false; + } + + /* + * Somebody could have pinned the buffer while we were doing + * the I/O and had given up the BufMgrLock (though they would + * be waiting for us to clear the BM_IO_IN_PROGRESS flag). + * That's why this is a loop -- if so, we need to clear the + * I/O flags, remove our pin and start all over again. + * + * People may be making buffers free at any time, so there's no + * reason to think that we have an immediate disaster on our + * hands. + */ + if (buf && buf->refcount > 1) + { + inProgress = FALSE; + buf->flags &= ~BM_IO_IN_PROGRESS; + TerminateBufferIO(buf); + PrivateRefCount[BufferDescriptorGetBuffer(buf) - 1] = 0; + buf->refcount--; + buf = (BufferDesc *) NULL; + } + + /* + * Somebody could have allocated another buffer for the same + * block we are about to read in. (While we flush out the + * dirty buffer, we don't hold the lock and someone could have + * allocated another buffer for the same block. The problem is + * we haven't gotten around to insert the new tag into the + * buffer table. So we need to check here. -ay 3/95 + */ + buf2 = BufTableLookup(&newTag); + if (buf2 != NULL) + { + + /* + * Found it. Someone has already done what we're about to + * do. We'll just handle this as if it were found in the + * buffer pool in the first place. + */ + if (buf != NULL) + { + buf->flags &= ~BM_IO_IN_PROGRESS; + TerminateBufferIO(buf); + /* give up the buffer since we don't need it any more */ + PrivateRefCount[BufferDescriptorGetBuffer(buf) - 1] = 0; + Assert(buf->refcount > 0); + buf->refcount--; + if (buf->refcount == 0) + { + AddBufferToFreelist(buf); + buf->flags |= BM_FREE; + } + } + + PinBuffer(buf2); + inProgress = (buf2->flags & BM_IO_IN_PROGRESS); + + *foundPtr = TRUE; + if (inProgress) + { + WaitIO(buf2, BufMgrLock); + inProgress = (buf2->flags & BM_IO_IN_PROGRESS); + } + if (BUFFER_IS_BROKEN(buf2)) + *foundPtr = FALSE; + + if (!(*foundPtr)) + StartBufferIO(buf2, true); + SpinRelease(BufMgrLock); + + return buf2; + } + } + } + + /* + * At this point we should have the sole pin on a non-dirty buffer and + * we may or may not already have the BM_IO_IN_PROGRESS flag set. + */ + + /* + * Change the name of the buffer in the lookup table: + * + * Need to update the lookup table before the read starts. If someone + * comes along looking for the buffer while we are reading it in, we + * don't want them to allocate a new buffer. For the same reason, we + * didn't want to erase the buf table entry for the buffer we were + * writing back until now, either. + */ + + if (!BufTableDelete(buf)) + { + SpinRelease(BufMgrLock); + elog(FATAL, "buffer wasn't in the buffer table\n"); + } + + /* record the database name and relation name for this buffer */ + strcpy(buf->blind.dbname, (DatabaseName) ? DatabaseName : "Recovery"); + strcpy(buf->blind.relname, RelationGetPhysicalRelationName(reln)); + + INIT_BUFFERTAG(&(buf->tag), reln, blockNum); + if (!BufTableInsert(buf)) + { + SpinRelease(BufMgrLock); + elog(FATAL, "Buffer in lookup table twice \n"); + } + + /* + * Buffer contents are currently invalid. Have to mark IO IN PROGRESS + * so no one fiddles with them until the read completes. If this + * routine has been called simply to allocate a buffer, no io will be + * attempted, so the flag isnt set. 
+ */ + if (!inProgress) + StartBufferIO(buf, true); + else + ContinueBufferIO(buf, true); + +#ifdef BMTRACE + _bm_trace((reln->rd_rel->relisshared ? 0 : MyDatabaseId), RelationGetRelid(reln), blockNum, BufferDescriptorGetBuffer(buf), BMT_ALLOCNOTFND); +#endif /* BMTRACE */ + + SpinRelease(BufMgrLock); + + return buf; +} + +/* + * WriteBuffer + * + * Marks buffer contents as dirty (actual write happens later). + * + * Assume that buffer is pinned. Assume that reln is + * valid. + * + * Side Effects: + * Pin count is decremented. + */ + +#undef WriteBuffer + +int +WriteBuffer(Buffer buffer) +{ + BufferDesc *bufHdr; + + if (BufferIsLocal(buffer)) + return WriteLocalBuffer(buffer, TRUE); + + if (BAD_BUFFER_ID(buffer)) + return FALSE; + + bufHdr = &BufferDescriptors[buffer - 1]; + + SharedBufferChanged = true; + + SpinAcquire(BufMgrLock); + Assert(bufHdr->refcount > 0); + + bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED); + + UnpinBuffer(bufHdr); + SpinRelease(BufMgrLock); + + return TRUE; +} + +/* + * WriteNoReleaseBuffer -- like WriteBuffer, but do not unpin the buffer + * when the operation is complete. + */ +int +WriteNoReleaseBuffer(Buffer buffer) +{ + BufferDesc *bufHdr; + + if (BufferIsLocal(buffer)) + return WriteLocalBuffer(buffer, FALSE); + + if (BAD_BUFFER_ID(buffer)) + return STATUS_ERROR; + + bufHdr = &BufferDescriptors[buffer - 1]; + + SharedBufferChanged = true; + + SpinAcquire(BufMgrLock); + Assert(bufHdr->refcount > 0); + + bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED); + + SpinRelease(BufMgrLock); + + return STATUS_OK; +} + + +#undef ReleaseAndReadBuffer +/* + * ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer() + * so that only one semop needs to be called. + * + */ +Buffer +ReleaseAndReadBuffer(Buffer buffer, + Relation relation, + BlockNumber blockNum) +{ + BufferDesc *bufHdr; + Buffer retbuf; + + if (BufferIsLocal(buffer)) + { + Assert(LocalRefCount[-buffer - 1] > 0); + LocalRefCount[-buffer - 1]--; + } + else + { + if (BufferIsValid(buffer)) + { + bufHdr = &BufferDescriptors[buffer - 1]; + Assert(PrivateRefCount[buffer - 1] > 0); + PrivateRefCount[buffer - 1]--; + if (PrivateRefCount[buffer - 1] == 0) + { + SpinAcquire(BufMgrLock); + Assert(bufHdr->refcount > 0); + bufHdr->refcount--; + if (bufHdr->refcount == 0) + { + AddBufferToFreelist(bufHdr); + bufHdr->flags |= BM_FREE; + } + retbuf = ReadBufferWithBufferLock(relation, blockNum, true); + return retbuf; + } + } + } + + return ReadBuffer(relation, blockNum); +} + +/* + * BufferSync -- Write all dirty buffers in the pool. + * + * This is called at checkpoint time and write out all dirty buffers. 
+ */
+void
+BufferSync()
+{
+	int			i;
+	BufferDesc *bufHdr;
+	Buffer		buffer;
+	int			status;
+	RelFileNode rnode;
+	XLogRecPtr	recptr;
+	Relation	reln = NULL;
+
+	for (i = 0, bufHdr = BufferDescriptors; i < NBuffers; i++, bufHdr++)
+	{
+
+		SpinAcquire(BufMgrLock);
+
+		if (!(bufHdr->flags & BM_VALID))
+		{
+			SpinRelease(BufMgrLock);
+			continue;
+		}
+
+		/*
+		 * Pin buffer and ensure that no one reads it from disk
+		 */
+		PinBuffer(bufHdr);
+		/* Synchronize with BufferAlloc */
+		if (bufHdr->flags & BM_IO_IN_PROGRESS)
+			WaitIO(bufHdr, BufMgrLock);
+
+		buffer = BufferDescriptorGetBuffer(bufHdr);
+		rnode = bufHdr->tag.rnode;
+
+		SpinRelease(BufMgrLock);
+
+		/*
+		 * Try to find relation for buffer
+		 */
+		reln = RelationNodeCacheGetRelation(rnode);
+
+		/*
+		 * Protect buffer content against concurrent update
+		 */
+		LockBuffer(buffer, BUFFER_LOCK_SHARE);
+
+		/*
+		 * Force XLOG flush for buffer's LSN
+		 */
+		recptr = BufferGetLSN(bufHdr);
+		XLogFlush(recptr);
+
+		/*
+		 * Now it's safe to write buffer to disk
+		 * (if needed at all -:))
+		 */
+
+		SpinAcquire(BufMgrLock);
+		if (bufHdr->flags & BM_IO_IN_PROGRESS)
+			WaitIO(bufHdr, BufMgrLock);
+
+		if (bufHdr->flags & BM_DIRTY || bufHdr->cntxDirty)
+		{
+			bufHdr->flags &= ~BM_JUST_DIRTIED;
+			StartBufferIO(bufHdr, false);	/* output IO start */
+
+			SpinRelease(BufMgrLock);
+
+			if (reln == (Relation) NULL)
+			{
+				status = smgrblindwrt(DEFAULT_SMGR,
+									  bufHdr->tag.rnode,
+									  bufHdr->tag.blockNum,
+									  (char *) MAKE_PTR(bufHdr->data),
+									  true);	/* must fsync */
+			}
+			else
+			{
+				status = smgrwrite(DEFAULT_SMGR, reln,
+								   bufHdr->tag.blockNum,
+								   (char *) MAKE_PTR(bufHdr->data));
+			}
+
+			if (status == SM_FAIL)	/* disk failure ?! */
+				elog(STOP, "BufferSync: cannot write %u for %s",
+					 bufHdr->tag.blockNum, bufHdr->blind.relname);
+
+			/*
+			 * Note that it's safe to change cntxDirty here because
+			 * we protect it from upper writers by the share lock and from
+			 * other bufmgr routines by BM_IO_IN_PROGRESS
+			 */
+			bufHdr->cntxDirty = false;
+
+			/*
+			 * Release the per-buffer readlock, reacquire BufMgrLock.
+			 */
+			LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+			BufferFlushCount++;
+
+			SpinAcquire(BufMgrLock);
+
+			bufHdr->flags &= ~BM_IO_IN_PROGRESS;	/* mark IO finished */
+			TerminateBufferIO(bufHdr);	/* Sync IO finished */
+
+			/*
+			 * If this buffer was marked by someone as DIRTY while
+			 * we were flushing it out we must not clear DIRTY
+			 * flag - vadim 01/17/97
+			 */
+			if (!(bufHdr->flags & BM_JUST_DIRTIED))
+				bufHdr->flags &= ~BM_DIRTY;
+		}
+		else
+			LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+
+		UnpinBuffer(bufHdr);
+
+		SpinRelease(BufMgrLock);
+
+		/* drop refcnt obtained by RelationNodeCacheGetRelation */
+		if (reln != (Relation) NULL)
+		{
+			RelationDecrementReferenceCount(reln);
+			reln = NULL;
+		}
+	}
+
+}
+
+/*
+ * WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf' is cleared.
+ *
+ * Should be entered with buffer manager spinlock held; releases it before
+ * waiting and re-acquires it afterwards.
+ *
+ * OLD NOTES:
+ *		Because IO_IN_PROGRESS conflicts are
+ *		expected to be rare, there is only one BufferIO
+ *		lock in the entire system.  All processes block
+ *		on this semaphore when they try to use a buffer
+ *		that someone else is faulting in.  Whenever a
+ *		process finishes an IO and someone is waiting for
+ *		the buffer, BufferIO is signaled (SignalIO).  All
+ *		waiting processes then wake up and check to see
+ *		if their buffer is now ready.  This implementation
+ *		is simple, but efficient enough if WaitIO is
+ *		rarely called by multiple processes simultaneously.
+ * + * NEW NOTES: + * The above is true only on machines without test-and-set + * semaphores (which we hope are few, these days). On better + * hardware, each buffer has a spinlock that we can wait on. + */ +#ifdef HAS_TEST_AND_SET + +static void +WaitIO(BufferDesc *buf, SPINLOCK spinlock) +{ + + /* + * Changed to wait until there's no IO - Inoue 01/13/2000 + */ + while ((buf->flags & BM_IO_IN_PROGRESS) != 0) + { + SpinRelease(spinlock); + S_LOCK(&(buf->io_in_progress_lock)); + S_UNLOCK(&(buf->io_in_progress_lock)); + SpinAcquire(spinlock); + } +} + +#else /* !HAS_TEST_AND_SET */ + +IpcSemaphoreId WaitIOSemId; +IpcSemaphoreId WaitCLSemId; + +static void +WaitIO(BufferDesc *buf, SPINLOCK spinlock) +{ + bool inProgress; + + for (;;) + { + + /* wait until someone releases IO lock */ + (*NWaitIOBackendP)++; + SpinRelease(spinlock); + IpcSemaphoreLock(WaitIOSemId, 0, 1); + SpinAcquire(spinlock); + inProgress = (buf->flags & BM_IO_IN_PROGRESS); + if (!inProgress) + break; + } +} + +/* + * SignalIO + */ +static void +SignalIO(BufferDesc *buf) +{ + /* somebody better be waiting. */ + Assert(buf->refcount > 1); + IpcSemaphoreUnlock(WaitIOSemId, 0, *NWaitIOBackendP); + *NWaitIOBackendP = 0; +} + +#endif /* HAS_TEST_AND_SET */ + +long NDirectFileRead; /* some I/O's are direct file access. + * bypass bufmgr */ +long NDirectFileWrite; /* e.g., I/O in psort and hashjoin. */ + +void +PrintBufferUsage(FILE *statfp) +{ + float hitrate; + float localhitrate; + + if (ReadBufferCount == 0) + hitrate = 0.0; + else + hitrate = (float) BufferHitCount *100.0 / ReadBufferCount; + + if (ReadLocalBufferCount == 0) + localhitrate = 0.0; + else + localhitrate = (float) LocalBufferHitCount *100.0 / ReadLocalBufferCount; + + fprintf(statfp, "!\tShared blocks: %10ld read, %10ld written, buffer hit rate = %.2f%%\n", + ReadBufferCount - BufferHitCount, BufferFlushCount, hitrate); + fprintf(statfp, "!\tLocal blocks: %10ld read, %10ld written, buffer hit rate = %.2f%%\n", + ReadLocalBufferCount - LocalBufferHitCount, LocalBufferFlushCount, localhitrate); + fprintf(statfp, "!\tDirect blocks: %10ld read, %10ld written\n", + NDirectFileRead, NDirectFileWrite); +} + +void +ResetBufferUsage() +{ + BufferHitCount = 0; + ReadBufferCount = 0; + BufferFlushCount = 0; + LocalBufferHitCount = 0; + ReadLocalBufferCount = 0; + LocalBufferFlushCount = 0; + NDirectFileRead = 0; + NDirectFileWrite = 0; +} + +/* ---------------------------------------------- + * ResetBufferPool + * + * This routine is supposed to be called when a transaction aborts. + * it will release all the buffer pins held by the transaction. + * Currently, we also call it during commit if BufferPoolCheckLeak + * detected a problem --- in that case, isCommit is TRUE, and we + * only clean up buffer pin counts. + * + * During abort, we also forget any pending fsync requests. Dirtied buffers + * will still get written, eventually, but there will be no fsync for them. 
+ * + * ---------------------------------------------- + */ +void +ResetBufferPool(bool isCommit) +{ + int i; + + for (i = 0; i < NBuffers; i++) + { + if (PrivateRefCount[i] != 0) + { + BufferDesc *buf = &BufferDescriptors[i]; + + SpinAcquire(BufMgrLock); + Assert(buf->refcount > 0); + buf->refcount--; + if (buf->refcount == 0) + { + AddBufferToFreelist(buf); + buf->flags |= BM_FREE; + } + SpinRelease(BufMgrLock); + } + PrivateRefCount[i] = 0; + } + + ResetLocalBufferPool(); + + if (!isCommit) + smgrabort(); +} + +/* ----------------------------------------------- + * BufferPoolCheckLeak + * + * check if there is buffer leak + * + * ----------------------------------------------- + */ +int +BufferPoolCheckLeak() +{ + int i; + int result = 0; + + for (i = 1; i <= NBuffers; i++) + { + if (PrivateRefCount[i - 1] != 0) + { + BufferDesc *buf = &(BufferDescriptors[i - 1]); + + elog(NOTICE, + "Buffer Leak: [%03d] (freeNext=%ld, freePrev=%ld, \ +relname=%s, blockNum=%d, flags=0x%x, refcount=%d %ld)", + i - 1, buf->freeNext, buf->freePrev, + buf->blind.relname, buf->tag.blockNum, buf->flags, + buf->refcount, PrivateRefCount[i - 1]); + result = 1; + } + } + return result; +} + +/* ------------------------------------------------ + * FlushBufferPool + * + * Flush all dirty blocks in buffer pool to disk + * at the checkpoint time + * ------------------------------------------------ + */ +void +FlushBufferPool(void) +{ + BufferSync(); + smgrsync(); +} + +/* + * At the commit time we have to flush local buffer pool only + */ +void +BufmgrCommit(void) +{ + LocalBufferSync(); + smgrcommit(); +} + +/* + * BufferGetBlockNumber + * Returns the block number associated with a buffer. + * + * Note: + * Assumes that the buffer is valid. + */ +BlockNumber +BufferGetBlockNumber(Buffer buffer) +{ + Assert(BufferIsValid(buffer)); + + /* XXX should be a critical section */ + if (BufferIsLocal(buffer)) + return LocalBufferDescriptors[-buffer - 1].tag.blockNum; + else + return BufferDescriptors[buffer - 1].tag.blockNum; +} + +/* + * BufferReplace + * + * Write out the buffer corresponding to 'bufHdr' + * + * BufMgrLock must be held at entry, and the buffer must be pinned. + */ +static int +BufferReplace(BufferDesc *bufHdr) +{ + Relation reln; + XLogRecPtr recptr; + int status; + + /* To check if block content changed while flushing. - vadim 01/17/97 */ + bufHdr->flags &= ~BM_JUST_DIRTIED; + + SpinRelease(BufMgrLock); + + /* + * No need to lock buffer context - no one should be able to + * end ReadBuffer + */ + recptr = BufferGetLSN(bufHdr); + XLogFlush(recptr); + + reln = RelationNodeCacheGetRelation(bufHdr->tag.rnode); + + if (reln != (Relation) NULL) + { + status = smgrwrite(DEFAULT_SMGR, reln, bufHdr->tag.blockNum, + (char *) MAKE_PTR(bufHdr->data)); + } + else + { + status = smgrblindwrt(DEFAULT_SMGR, bufHdr->tag.rnode, + bufHdr->tag.blockNum, + (char *) MAKE_PTR(bufHdr->data), + false); /* no fsync */ + } + + /* drop relcache refcnt incremented by RelationIdCacheGetRelation */ + if (reln != (Relation) NULL) + RelationDecrementReferenceCount(reln); + + SpinAcquire(BufMgrLock); + + if (status == SM_FAIL) + return FALSE; + + BufferFlushCount++; + + return TRUE; +} + +/* + * RelationGetNumberOfBlocks + * Returns the buffer descriptor associated with a page in a relation. + * + * Note: + * XXX may fail for huge relations. + * XXX should be elsewhere. + * XXX maybe should be hidden + */ +BlockNumber +RelationGetNumberOfBlocks(Relation relation) +{ + return ((relation->rd_myxactonly) ? 
relation->rd_nblocks : + smgrnblocks(DEFAULT_SMGR, relation)); +} + +/* --------------------------------------------------------------------- + * ReleaseRelationBuffers + * + * This function removes all the buffered pages for a relation + * from the buffer pool. Dirty pages are simply dropped, without + * bothering to write them out first. This is used when the + * relation is about to be deleted. We assume that the caller + * holds an exclusive lock on the relation, which should assure + * that no new buffers will be acquired for the rel meanwhile. + * + * XXX currently it sequentially searches the buffer pool, should be + * changed to more clever ways of searching. + * -------------------------------------------------------------------- + */ +void +ReleaseRelationBuffers(Relation rel) +{ + int i; + BufferDesc *bufHdr; + + if (rel->rd_myxactonly) + { + for (i = 0; i < NLocBuffer; i++) + { + bufHdr = &LocalBufferDescriptors[i]; + if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node)) + { + bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED); + bufHdr->cntxDirty = false; + LocalRefCount[i] = 0; + bufHdr->tag.rnode.relNode = InvalidOid; + } + } + return; + } + + SpinAcquire(BufMgrLock); + for (i = 1; i <= NBuffers; i++) + { + bufHdr = &BufferDescriptors[i - 1]; +recheck: + if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node)) + { + + /* + * If there is I/O in progress, better wait till it's done; + * don't want to delete the relation out from under someone + * who's just trying to flush the buffer! + */ + if (bufHdr->flags & BM_IO_IN_PROGRESS) + { + WaitIO(bufHdr, BufMgrLock); + + /* + * By now, the buffer very possibly belongs to some other + * rel, so check again before proceeding. + */ + goto recheck; + } + /* Now we can do what we came for */ + bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED); + bufHdr->cntxDirty = false; + + /* + * Release any refcount we may have. + * + * This is very probably dead code, and if it isn't then it's + * probably wrong. I added the Assert to find out --- tgl + * 11/99. + */ + if (!(bufHdr->flags & BM_FREE)) + { + /* Assert checks that buffer will actually get freed! */ + Assert(PrivateRefCount[i - 1] == 1 && + bufHdr->refcount == 1); + /* ReleaseBuffer expects we do not hold the lock at entry */ + SpinRelease(BufMgrLock); + ReleaseBuffer(i); + SpinAcquire(BufMgrLock); + } + /* + * And mark the buffer as no longer occupied by this rel. + */ + BufTableDelete(bufHdr); + } + } + + SpinRelease(BufMgrLock); +} + +/* --------------------------------------------------------------------- + * DropBuffers + * + * This function removes all the buffers in the buffer cache for a + * particular database. Dirty pages are simply dropped, without + * bothering to write them out first. This is used when we destroy a + * database, to avoid trying to flush data to disk when the directory + * tree no longer exists. Implementation is pretty similar to + * ReleaseRelationBuffers() which is for destroying just one relation. + * -------------------------------------------------------------------- + */ +void +DropBuffers(Oid dbid) +{ + int i; + BufferDesc *bufHdr; + + SpinAcquire(BufMgrLock); + for (i = 1; i <= NBuffers; i++) + { + bufHdr = &BufferDescriptors[i - 1]; +recheck: + /* + * We know that currently database OID is tblNode but + * this probably will be changed in future and this + * func will be used to drop tablespace buffers. 
+ */ + if (bufHdr->tag.rnode.tblNode == dbid) + { + + /* + * If there is I/O in progress, better wait till it's done; + * don't want to delete the database out from under someone + * who's just trying to flush the buffer! + */ + if (bufHdr->flags & BM_IO_IN_PROGRESS) + { + WaitIO(bufHdr, BufMgrLock); + + /* + * By now, the buffer very possibly belongs to some other + * DB, so check again before proceeding. + */ + goto recheck; + } + /* Now we can do what we came for */ + bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED); + bufHdr->cntxDirty = false; + + /* + * The thing should be free, if caller has checked that no + * backends are running in that database. + */ + Assert(bufHdr->flags & BM_FREE); + /* + * And mark the buffer as no longer occupied by this page. + */ + BufTableDelete(bufHdr); + } + } + SpinRelease(BufMgrLock); +} + +/* ----------------------------------------------------------------- + * PrintBufferDescs + * + * this function prints all the buffer descriptors, for debugging + * use only. + * ----------------------------------------------------------------- + */ +void +PrintBufferDescs() +{ + int i; + BufferDesc *buf = BufferDescriptors; + + if (IsUnderPostmaster) + { + SpinAcquire(BufMgrLock); + for (i = 0; i < NBuffers; ++i, ++buf) + { + elog(DEBUG, "[%02d] (freeNext=%ld, freePrev=%ld, relname=%s, \ +blockNum=%d, flags=0x%x, refcount=%d %ld)", + i, buf->freeNext, buf->freePrev, + buf->blind.relname, buf->tag.blockNum, buf->flags, + buf->refcount, PrivateRefCount[i]); + } + SpinRelease(BufMgrLock); + } + else + { + /* interactive backend */ + for (i = 0; i < NBuffers; ++i, ++buf) + { + printf("[%-2d] (%s, %d) flags=0x%x, refcnt=%d %ld)\n", + i, buf->blind.relname, buf->tag.blockNum, + buf->flags, buf->refcount, PrivateRefCount[i]); + } + } +} + +void +PrintPinnedBufs() +{ + int i; + BufferDesc *buf = BufferDescriptors; + + SpinAcquire(BufMgrLock); + for (i = 0; i < NBuffers; ++i, ++buf) + { + if (PrivateRefCount[i] > 0) + elog(NOTICE, "[%02d] (freeNext=%ld, freePrev=%ld, relname=%s, \ +blockNum=%d, flags=0x%x, refcount=%d %ld)\n", + i, buf->freeNext, buf->freePrev, buf->blind.relname, + buf->tag.blockNum, buf->flags, + buf->refcount, PrivateRefCount[i]); + } + SpinRelease(BufMgrLock); +} + +/* + * BufferPoolBlowaway + * + * this routine is solely for the purpose of experiments -- sometimes + * you may want to blowaway whatever is left from the past in buffer + * pool and start measuring some performance with a clean empty buffer + * pool. + */ +#ifdef NOT_USED +void +BufferPoolBlowaway() +{ + int i; + + BufferSync(); + for (i = 1; i <= NBuffers; i++) + { + if (BufferIsValid(i)) + { + while (BufferIsValid(i)) + ReleaseBuffer(i); + } + BufTableDelete(&BufferDescriptors[i - 1]); + } +} + +#endif + +/* --------------------------------------------------------------------- + * FlushRelationBuffers + * + * This function flushes all dirty pages of a relation out to disk. + * Furthermore, pages that have blocknumber >= firstDelBlock are + * actually removed from the buffer pool. An error code is returned + * if we fail to dump a dirty buffer or if we find one of + * the target pages is pinned into the cache. + * + * This is used by VACUUM before truncating the relation to the given + * number of blocks. (TRUNCATE TABLE also uses it in the same way.) + * It might seem unnecessary to flush dirty pages before firstDelBlock, + * since VACUUM should already have committed its changes. 
However, + * it is possible for there still to be dirty pages: if some page + * had unwritten on-row tuple status updates from a prior transaction, + * and VACUUM had no additional changes to make to that page, then + * VACUUM won't have written it. This is harmless in most cases but + * will break pg_upgrade, which relies on VACUUM to ensure that *all* + * tuples have correct on-row status. So, we check and flush all + * dirty pages of the rel regardless of block number. + * + * This is also used by RENAME TABLE (with firstDelBlock = 0) + * to clear out the buffer cache before renaming the physical files of + * a relation. Without that, some other backend might try to do a + * blind write of a buffer page (relying on the BlindId of the buffer) + * and fail because it's not got the right filename anymore. + * + * In all cases, the caller should be holding AccessExclusiveLock on + * the target relation to ensure that no other backend is busy reading + * more blocks of the relation. + * + * Formerly, we considered it an error condition if we found dirty + * buffers here. However, since BufferSync no longer forces out all + * dirty buffers at every xact commit, it's possible for dirty buffers + * to still be present in the cache due to failure of an earlier + * transaction. So, must flush dirty buffers without complaint. + * + * Returns: 0 - Ok, -1 - FAILED TO WRITE DIRTY BUFFER, -2 - PINNED + * + * XXX currently it sequentially searches the buffer pool, should be + * changed to more clever ways of searching. + * -------------------------------------------------------------------- + */ +int +FlushRelationBuffers(Relation rel, BlockNumber firstDelBlock) +{ + int i; + BufferDesc *bufHdr; + XLogRecPtr recptr; + int status; + + if (rel->rd_myxactonly) + { + for (i = 0; i < NLocBuffer; i++) + { + bufHdr = &LocalBufferDescriptors[i]; + if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node)) + { + if (bufHdr->flags & BM_DIRTY || bufHdr->cntxDirty) + { + status = smgrwrite(DEFAULT_SMGR, rel, + bufHdr->tag.blockNum, + (char *) MAKE_PTR(bufHdr->data)); + if (status == SM_FAIL) + { + elog(NOTICE, "FlushRelationBuffers(%s (local), %u): block %u is dirty, could not flush it", + RelationGetRelationName(rel), firstDelBlock, + bufHdr->tag.blockNum); + return(-1); + } + bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED); + bufHdr->cntxDirty = false; + } + if (LocalRefCount[i] > 0) + { + elog(NOTICE, "FlushRelationBuffers(%s (local), %u): block %u is referenced (%ld)", + RelationGetRelationName(rel), firstDelBlock, + bufHdr->tag.blockNum, LocalRefCount[i]); + return(-2); + } + if (bufHdr->tag.blockNum >= firstDelBlock) + { + bufHdr->tag.rnode.relNode = InvalidOid; + } + } + } + return 0; + } + + SpinAcquire(BufMgrLock); + for (i = 0; i < NBuffers; i++) + { + bufHdr = &BufferDescriptors[i]; + if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node)) + { + if (bufHdr->flags & BM_DIRTY || bufHdr->cntxDirty) + { + PinBuffer(bufHdr); + if (bufHdr->flags & BM_IO_IN_PROGRESS) + WaitIO(bufHdr, BufMgrLock); + SpinRelease(BufMgrLock); + + /* + * Force XLOG flush for buffer' LSN + */ + recptr = BufferGetLSN(bufHdr); + XLogFlush(recptr); + + /* + * Now it's safe to write buffer to disk + */ + + SpinAcquire(BufMgrLock); + if (bufHdr->flags & BM_IO_IN_PROGRESS) + WaitIO(bufHdr, BufMgrLock); + + if (bufHdr->flags & BM_DIRTY || bufHdr->cntxDirty) + { + bufHdr->flags &= ~BM_JUST_DIRTIED; + StartBufferIO(bufHdr, false); /* output IO start */ + + SpinRelease(BufMgrLock); + + status = smgrwrite(DEFAULT_SMGR, rel, + 
bufHdr->tag.blockNum, + (char *) MAKE_PTR(bufHdr->data)); + + if (status == SM_FAIL) /* disk failure ?! */ + elog(STOP, "FlushRelationBuffers: cannot write %u for %s", + bufHdr->tag.blockNum, bufHdr->blind.relname); + + BufferFlushCount++; + + SpinAcquire(BufMgrLock); + bufHdr->flags &= ~BM_IO_IN_PROGRESS; + TerminateBufferIO(bufHdr); + Assert(!(bufHdr->flags & BM_JUST_DIRTIED)); + bufHdr->flags &= ~BM_DIRTY; + /* + * Note that it's safe to change cntxDirty here because + * of we protect it from upper writers by + * AccessExclusiveLock and from other bufmgr routines + * by BM_IO_IN_PROGRESS + */ + bufHdr->cntxDirty = false; + } + UnpinBuffer(bufHdr); + } + if (!(bufHdr->flags & BM_FREE)) + { + SpinRelease(BufMgrLock); + elog(NOTICE, "FlushRelationBuffers(%s, %u): block %u is referenced (private %ld, global %d)", + RelationGetRelationName(rel), firstDelBlock, + bufHdr->tag.blockNum, + PrivateRefCount[i], bufHdr->refcount); + return -2; + } + if (bufHdr->tag.blockNum >= firstDelBlock) + { + BufTableDelete(bufHdr); + } + } + } + SpinRelease(BufMgrLock); + return 0; +} + +#undef ReleaseBuffer + +/* + * ReleaseBuffer -- remove the pin on a buffer without + * marking it dirty. + * + */ +int +ReleaseBuffer(Buffer buffer) +{ + BufferDesc *bufHdr; + + if (BufferIsLocal(buffer)) + { + Assert(LocalRefCount[-buffer - 1] > 0); + LocalRefCount[-buffer - 1]--; + return STATUS_OK; + } + + if (BAD_BUFFER_ID(buffer)) + return STATUS_ERROR; + + bufHdr = &BufferDescriptors[buffer - 1]; + + Assert(PrivateRefCount[buffer - 1] > 0); + PrivateRefCount[buffer - 1]--; + if (PrivateRefCount[buffer - 1] == 0) + { + SpinAcquire(BufMgrLock); + Assert(bufHdr->refcount > 0); + bufHdr->refcount--; + if (bufHdr->refcount == 0) + { + AddBufferToFreelist(bufHdr); + bufHdr->flags |= BM_FREE; + } + SpinRelease(BufMgrLock); + } + + return STATUS_OK; +} + +#ifdef NOT_USED +void +IncrBufferRefCount_Debug(char *file, int line, Buffer buffer) +{ + IncrBufferRefCount(buffer); + if (ShowPinTrace && !BufferIsLocal(buffer) && is_userbuffer(buffer)) + { + BufferDesc *buf = &BufferDescriptors[buffer - 1]; + + fprintf(stderr, "PIN(Incr) %ld relname = %s, blockNum = %d, \ +refcount = %ld, file: %s, line: %d\n", + buffer, buf->blind.relname, buf->tag.blockNum, + PrivateRefCount[buffer - 1], file, line); + } +} + +#endif + +#ifdef NOT_USED +void +ReleaseBuffer_Debug(char *file, int line, Buffer buffer) +{ + ReleaseBuffer(buffer); + if (ShowPinTrace && !BufferIsLocal(buffer) && is_userbuffer(buffer)) + { + BufferDesc *buf = &BufferDescriptors[buffer - 1]; + + fprintf(stderr, "UNPIN(Rel) %ld relname = %s, blockNum = %d, \ +refcount = %ld, file: %s, line: %d\n", + buffer, buf->blind.relname, buf->tag.blockNum, + PrivateRefCount[buffer - 1], file, line); + } +} + +#endif + +#ifdef NOT_USED +int +ReleaseAndReadBuffer_Debug(char *file, + int line, + Buffer buffer, + Relation relation, + BlockNumber blockNum) +{ + bool bufferValid; + Buffer b; + + bufferValid = BufferIsValid(buffer); + b = ReleaseAndReadBuffer(buffer, relation, blockNum); + if (ShowPinTrace && bufferValid && BufferIsLocal(buffer) + && is_userbuffer(buffer)) + { + BufferDesc *buf = &BufferDescriptors[buffer - 1]; + + fprintf(stderr, "UNPIN(Rel&Rd) %ld relname = %s, blockNum = %d, \ +refcount = %ld, file: %s, line: %d\n", + buffer, buf->blind.relname, buf->tag.blockNum, + PrivateRefCount[buffer - 1], file, line); + } + if (ShowPinTrace && BufferIsLocal(buffer) && is_userbuffer(buffer)) + { + BufferDesc *buf = &BufferDescriptors[b - 1]; + + fprintf(stderr, "PIN(Rel&Rd) %ld relname = 
%s, blockNum = %d, \ +refcount = %ld, file: %s, line: %d\n", + b, buf->blind.relname, buf->tag.blockNum, + PrivateRefCount[b - 1], file, line); + } + return b; +} + +#endif + +#ifdef BMTRACE + +/* + * trace allocations and deallocations in a circular buffer in + * shared memory. check the buffer before doing the allocation, + * and die if there's anything fishy. + */ + +_bm_trace(Oid dbId, Oid relId, int blkNo, int bufNo, int allocType) +{ + long start, + cur; + bmtrace *tb; + + start = *CurTraceBuf; + + if (start > 0) + cur = start - 1; + else + cur = BMT_LIMIT - 1; + + for (;;) + { + tb = &TraceBuf[cur]; + if (tb->bmt_op != BMT_NOTUSED) + { + if (tb->bmt_buf == bufNo) + { + if ((tb->bmt_op == BMT_DEALLOC) + || (tb->bmt_dbid == dbId && tb->bmt_relid == relId + && tb->bmt_blkno == blkNo)) + goto okay; + + /* die holding the buffer lock */ + _bm_die(dbId, relId, blkNo, bufNo, allocType, start, cur); + } + } + + if (cur == start) + goto okay; + + if (cur == 0) + cur = BMT_LIMIT - 1; + else + cur--; + } + +okay: + tb = &TraceBuf[start]; + tb->bmt_pid = MyProcPid; + tb->bmt_buf = bufNo; + tb->bmt_dbid = dbId; + tb->bmt_relid = relId; + tb->bmt_blkno = blkNo; + tb->bmt_op = allocType; + + *CurTraceBuf = (start + 1) % BMT_LIMIT; +} + +_bm_die(Oid dbId, Oid relId, int blkNo, int bufNo, + int allocType, long start, long cur) +{ + FILE *fp; + bmtrace *tb; + int i; + + tb = &TraceBuf[cur]; + + if ((fp = AllocateFile("/tmp/death_notice", "w")) == NULL) + elog(FATAL, "buffer alloc trace error and can't open log file"); + + fprintf(fp, "buffer alloc trace detected the following error:\n\n"); + fprintf(fp, " buffer %d being %s inconsistently with a previous %s\n\n", + bufNo, (allocType == BMT_DEALLOC ? "deallocated" : "allocated"), + (tb->bmt_op == BMT_DEALLOC ? "deallocation" : "allocation")); + + fprintf(fp, "the trace buffer contains:\n"); + + i = start; + for (;;) + { + tb = &TraceBuf[i]; + if (tb->bmt_op != BMT_NOTUSED) + { + fprintf(fp, " [%3d]%spid %d buf %2d for <%d,%u,%d> ", + i, (i == cur ? " ---> " : "\t"), + tb->bmt_pid, tb->bmt_buf, + tb->bmt_dbid, tb->bmt_relid, tb->bmt_blkno); + + switch (tb->bmt_op) + { + case BMT_ALLOCFND: + fprintf(fp, "allocate (found)\n"); + break; + + case BMT_ALLOCNOTFND: + fprintf(fp, "allocate (not found)\n"); + break; + + case BMT_DEALLOC: + fprintf(fp, "deallocate\n"); + break; + + default: + fprintf(fp, "unknown op type %d\n", tb->bmt_op); + break; + } + } + + i = (i + 1) % BMT_LIMIT; + if (i == start) + break; + } + + fprintf(fp, "\noperation causing error:\n"); + fprintf(fp, "\tpid %d buf %d for <%d,%u,%d> ", + getpid(), bufNo, dbId, relId, blkNo); + + switch (allocType) + { + case BMT_ALLOCFND: + fprintf(fp, "allocate (found)\n"); + break; + + case BMT_ALLOCNOTFND: + fprintf(fp, "allocate (not found)\n"); + break; + + case BMT_DEALLOC: + fprintf(fp, "deallocate\n"); + break; + + default: + fprintf(fp, "unknown op type %d\n", allocType); + break; + } + + FreeFile(fp); + + kill(getpid(), SIGILL); +} + +#endif /* BMTRACE */ + +/* + * SetBufferCommitInfoNeedsSave + * + * Mark a buffer dirty when we have updated tuple commit-status bits in it. + * + * This is similar to WriteNoReleaseBuffer, except that we do not set + * SharedBufferChanged or BufferDirtiedByMe, because we have not made a + * critical change that has to be flushed to disk before xact commit --- the + * status-bit update could be redone by someone else just as easily. The + * buffer will be marked dirty, but it will not be written to disk until + * there is another reason to write it. 
+ * + * This routine might get called many times on the same page, if we are making + * the first scan after commit of an xact that added/deleted many tuples. + * So, be as quick as we can if the buffer is already dirty. + */ +void +SetBufferCommitInfoNeedsSave(Buffer buffer) +{ + BufferDesc *bufHdr; + + if (BufferIsLocal(buffer)) + return; + + if (BAD_BUFFER_ID(buffer)) + return; + + bufHdr = &BufferDescriptors[buffer - 1]; + + if ((bufHdr->flags & (BM_DIRTY | BM_JUST_DIRTIED)) != + (BM_DIRTY | BM_JUST_DIRTIED)) + { + SpinAcquire(BufMgrLock); + Assert(bufHdr->refcount > 0); + bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED); + SpinRelease(BufMgrLock); + } +} + +void +UnlockBuffers() +{ + BufferDesc *buf; + int i; + + for (i = 0; i < NBuffers; i++) + { + if (BufferLocks[i] == 0) + continue; + + Assert(BufferIsValid(i + 1)); + buf = &(BufferDescriptors[i]); + +#ifdef HAS_TEST_AND_SET + S_LOCK(&(buf->cntx_lock)); +#else + IpcSemaphoreLock(WaitCLSemId, 0, IpcExclusiveLock); +#endif + + if (BufferLocks[i] & BL_R_LOCK) + { + Assert(buf->r_locks > 0); + (buf->r_locks)--; + } + if (BufferLocks[i] & BL_RI_LOCK) + { + + /* + * Someone else could remove our RI lock when acquiring W + * lock. This is possible if we came here from elog(ERROR) + * from IpcSemaphore{Lock|Unlock}(WaitCLSemId). And so we + * don't do Assert(buf->ri_lock) here. + */ + buf->ri_lock = false; + } + if (BufferLocks[i] & BL_W_LOCK) + { + Assert(buf->w_lock); + buf->w_lock = false; + } +#ifdef HAS_TEST_AND_SET + S_UNLOCK(&(buf->cntx_lock)); +#else + IpcSemaphoreUnlock(WaitCLSemId, 0, IpcExclusiveLock); +#endif + BufferLocks[i] = 0; + } +} + +void +LockBuffer(Buffer buffer, int mode) +{ + BufferDesc *buf; + bits8 *buflock; + + Assert(BufferIsValid(buffer)); + if (BufferIsLocal(buffer)) + return; + + buf = &(BufferDescriptors[buffer - 1]); + buflock = &(BufferLocks[buffer - 1]); + +#ifdef HAS_TEST_AND_SET + S_LOCK(&(buf->cntx_lock)); +#else + IpcSemaphoreLock(WaitCLSemId, 0, IpcExclusiveLock); +#endif + + if (mode == BUFFER_LOCK_UNLOCK) + { + if (*buflock & BL_R_LOCK) + { + Assert(buf->r_locks > 0); + Assert(!(buf->w_lock)); + Assert(!(*buflock & (BL_W_LOCK | BL_RI_LOCK))); + (buf->r_locks)--; + *buflock &= ~BL_R_LOCK; + } + else if (*buflock & BL_W_LOCK) + { + Assert(buf->w_lock); + Assert(buf->r_locks == 0); + Assert(!(*buflock & (BL_R_LOCK | BL_RI_LOCK))); + buf->w_lock = false; + *buflock &= ~BL_W_LOCK; + } + else + elog(ERROR, "UNLockBuffer: buffer %lu is not locked", buffer); + } + else if (mode == BUFFER_LOCK_SHARE) + { + unsigned i = 0; + + Assert(!(*buflock & (BL_R_LOCK | BL_W_LOCK | BL_RI_LOCK))); + while (buf->ri_lock || buf->w_lock) + { +#ifdef HAS_TEST_AND_SET + S_UNLOCK(&(buf->cntx_lock)); + s_lock_sleep(i++); + S_LOCK(&(buf->cntx_lock)); +#else + IpcSemaphoreUnlock(WaitCLSemId, 0, IpcExclusiveLock); + s_lock_sleep(i++); + IpcSemaphoreLock(WaitCLSemId, 0, IpcExclusiveLock); +#endif + } + (buf->r_locks)++; + *buflock |= BL_R_LOCK; + } + else if (mode == BUFFER_LOCK_EXCLUSIVE) + { + unsigned i = 0; + + Assert(!(*buflock & (BL_R_LOCK | BL_W_LOCK | BL_RI_LOCK))); + while (buf->r_locks > 0 || buf->w_lock) + { + if (buf->r_locks > 3 || (*buflock & BL_RI_LOCK)) + { + + /* + * Our RI lock might be removed by concurrent W lock + * acquiring (see what we do with RI locks below when our + * own W acquiring succeeded) and so we set RI lock again + * if we already did this. 
+ */ + *buflock |= BL_RI_LOCK; + buf->ri_lock = true; + } +#ifdef HAS_TEST_AND_SET + S_UNLOCK(&(buf->cntx_lock)); + s_lock_sleep(i++); + S_LOCK(&(buf->cntx_lock)); +#else + IpcSemaphoreUnlock(WaitCLSemId, 0, IpcExclusiveLock); + s_lock_sleep(i++); + IpcSemaphoreLock(WaitCLSemId, 0, IpcExclusiveLock); +#endif + } + buf->w_lock = true; + *buflock |= BL_W_LOCK; + + buf->cntxDirty = true; + + if (*buflock & BL_RI_LOCK) + { + + /* + * It's possible to remove RI locks acquired by another W + * lockers here, but they'll take care about it. + */ + buf->ri_lock = false; + *buflock &= ~BL_RI_LOCK; + } + } + else + elog(ERROR, "LockBuffer: unknown lock mode %d", mode); + +#ifdef HAS_TEST_AND_SET + S_UNLOCK(&(buf->cntx_lock)); +#else + IpcSemaphoreUnlock(WaitCLSemId, 0, IpcExclusiveLock); +#endif + +} + +/* + * Functions for IO error handling + * + * Note : We assume that nested buffer IO never occur. + * i.e at most one io_in_progress spinlock is held + * per proc. +*/ +static BufferDesc *InProgressBuf = (BufferDesc *) NULL; +static bool IsForInput; + +/* + * Function:StartBufferIO + * (Assumptions) + * My process is executing no IO + * BufMgrLock is held + * BM_IO_IN_PROGRESS mask is not set for the buffer + * The buffer is Pinned + * +*/ +static void +StartBufferIO(BufferDesc *buf, bool forInput) +{ + Assert(!InProgressBuf); + Assert(!(buf->flags & BM_IO_IN_PROGRESS)); + buf->flags |= BM_IO_IN_PROGRESS; +#ifdef HAS_TEST_AND_SET + + /* + * There used to be + * + * Assert(S_LOCK_FREE(&(buf->io_in_progress_lock))); + * + * here, but that's wrong because of the way WaitIO works: someone else + * waiting for the I/O to complete will succeed in grabbing the lock + * for a few instructions, and if we context-swap back to here the + * Assert could fail. Tiny window for failure, but I've seen it + * happen -- tgl + */ + S_LOCK(&(buf->io_in_progress_lock)); +#endif /* HAS_TEST_AND_SET */ + InProgressBuf = buf; + IsForInput = forInput; +} + +/* + * Function:TerminateBufferIO + * (Assumptions) + * My process is executing IO for the buffer + * BufMgrLock is held + * The buffer is Pinned + * +*/ +static void +TerminateBufferIO(BufferDesc *buf) +{ + Assert(buf == InProgressBuf); +#ifdef HAS_TEST_AND_SET + S_UNLOCK(&(buf->io_in_progress_lock)); +#else + if (buf->refcount > 1) + SignalIO(buf); +#endif /* HAS_TEST_AND_SET */ + InProgressBuf = (BufferDesc *) 0; +} + +/* + * Function:ContinueBufferIO + * (Assumptions) + * My process is executing IO for the buffer + * BufMgrLock is held + * The buffer is Pinned + * +*/ +static void +ContinueBufferIO(BufferDesc *buf, bool forInput) +{ + Assert(buf == InProgressBuf); + Assert(buf->flags & BM_IO_IN_PROGRESS); + IsForInput = forInput; +} + +#ifdef NOT_USED +void +InitBufferIO(void) +{ + InProgressBuf = (BufferDesc *) 0; +} +#endif + +/* + * This function is called from ProcReleaseSpins(). + * BufMgrLock isn't held when this function is called. + * BM_IO_ERROR is always set. If BM_IO_ERROR was already + * set in case of output,this routine would kill all + * backends and reset postmaster. 
+ */
+void
+AbortBufferIO(void)
+{
+	BufferDesc *buf = InProgressBuf;
+
+	if (buf)
+	{
+		Assert(buf->flags & BM_IO_IN_PROGRESS);
+		SpinAcquire(BufMgrLock);
+		if (IsForInput)
+			Assert(!(buf->flags & BM_DIRTY) && !(buf->cntxDirty));
+		else
+		{
+			Assert(buf->flags & BM_DIRTY || buf->cntxDirty);
+			if (buf->flags & BM_IO_ERROR)
+			{
+				elog(NOTICE, "write error may be permanent: cannot write block %u for %s/%s",
+					 buf->tag.blockNum, buf->blind.dbname, buf->blind.relname);
+			}
+			buf->flags |= BM_DIRTY;
+		}
+		buf->flags |= BM_IO_ERROR;
+		buf->flags &= ~BM_IO_IN_PROGRESS;
+		TerminateBufferIO(buf);
+		SpinRelease(BufMgrLock);
+	}
+}
+
+/*
+ * Clean up a buffer or mark it for cleanup.  The buffer may be cleaned
+ * up immediately if it's pinned only once.
+ *
+ * NOTE: buffer must be excl locked.
+ */
+void
+MarkBufferForCleanup(Buffer buffer, void (*CleanupFunc)(Buffer))
+{
+	BufferDesc *bufHdr = &BufferDescriptors[buffer - 1];
+
+	Assert(PrivateRefCount[buffer - 1] > 0);
+
+	if (PrivateRefCount[buffer - 1] > 1)
+	{
+		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+		PrivateRefCount[buffer - 1]--;
+		SpinAcquire(BufMgrLock);
+		Assert(bufHdr->refcount > 0);
+		bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
+		bufHdr->CleanupFunc = CleanupFunc;
+		SpinRelease(BufMgrLock);
+		return;
+	}
+
+	SpinAcquire(BufMgrLock);
+	Assert(bufHdr->refcount > 0);
+	if (bufHdr->refcount == 1)
+	{
+		SpinRelease(BufMgrLock);
+		CleanupFunc(buffer);
+		CleanupFunc = NULL;
+	}
+	else
+		SpinRelease(BufMgrLock);
+
+	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+	PrivateRefCount[buffer - 1]--;
+
+	SpinAcquire(BufMgrLock);
+	Assert(bufHdr->refcount > 0);
+	bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
+	bufHdr->CleanupFunc = CleanupFunc;
+	bufHdr->refcount--;
+	if (bufHdr->refcount == 0)
+	{
+		AddBufferToFreelist(bufHdr);
+		bufHdr->flags |= BM_FREE;
+	}
+	SpinRelease(BufMgrLock);
+	return;
+}
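One rule worth pulling out of the code above: BufferSync() and BufferReplace() both flush XLOG through the page's LSN before writing the page, which is the write-ahead-logging invariant this commit introduces. A condensed sketch of just that ordering (not part of the commit; WriteOneBuffer is a hypothetical name, and error handling and locking are omitted):

/*
 * Reader's sketch (not part of the commit): the WAL-before-data ordering
 * that BufferSync() and BufferReplace() above both follow.
 */
static int
WriteOneBuffer(Relation reln, BufferDesc *bufHdr)
{
	XLogRecPtr	recptr = BufferGetLSN(bufHdr);	/* page's LSN is at offset 0 */

	XLogFlush(recptr);		/* flush WAL through the page's LSN first... */

	return smgrwrite(DEFAULT_SMGR, reln,	/* ...only then write the page */
					 bufHdr->tag.blockNum,
					 (char *) MAKE_PTR(bufHdr->data));
}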