diff options
author | Tom Lane <tgl@sss.pgh.pa.us> | 2000-11-08 22:10:03 +0000 |
---|---|---|
committer | Tom Lane <tgl@sss.pgh.pa.us> | 2000-11-08 22:10:03 +0000 |
commit | 3908473c809d5c24940faebfabdad673f4302178 (patch) | |
tree | 6a1989499ee61771c7764afd2b24d12ebd25b8fb /src/backend/storage/buffer | |
parent | ebe0b236909732c75d665c73363bd4ac7a7aa138 (diff) | |
download | postgresql-3908473c809d5c24940faebfabdad673f4302178.tar.gz postgresql-3908473c809d5c24940faebfabdad673f4302178.zip |
Make DROP TABLE rollback-able: postpone physical file delete until commit.
(WAL logging for this is not done yet, however.) Clean up a number of really
crufty things that are no longer needed now that DROP behaves nicely. Make
the temp table mapper do the right thing when a drop or rename affecting a temp
table is rolled back. Also, remove the "relation modified while in use" error
check, in favor of locking tables at first reference and holding that lock
throughout the statement.
Diffstat (limited to 'src/backend/storage/buffer')
-rw-r--r-- | src/backend/storage/buffer/bufmgr.c | 200 | ||||
-rw-r--r-- | src/backend/storage/buffer/localbuf.c | 17 | ||||
-rw-r--r-- | src/backend/storage/buffer/xlog_bufmgr.c | 142 |
3 files changed, 277 insertions, 82 deletions
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 9c9bda5035c..8d40e8d952f 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v 1.92 2000/10/28 16:20:55 vadim Exp $ + * $Header: /cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v 1.93 2000/11/08 22:09:59 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -709,23 +709,28 @@ refcount = %ld, file: %s, line: %d\n", #endif /* - * FlushBuffer -- like WriteBuffer, but force the page to disk. + * FlushBuffer -- like WriteBuffer, but write the page immediately, + * rather than just marking it dirty. On success return, the buffer will + * no longer be dirty. * * 'buffer' is known to be dirty/pinned, so there should not be a * problem reading the BufferDesc members without the BufMgrLock * (nobody should be able to change tags out from under us). * - * Unpin if 'release' is TRUE. + * If 'sync' is true, a synchronous write is wanted (wait for buffer to hit + * the disk). Otherwise it's sufficient to issue the kernel write call. + * + * Unpin buffer if 'release' is true. */ int -FlushBuffer(Buffer buffer, bool release) +FlushBuffer(Buffer buffer, bool sync, bool release) { BufferDesc *bufHdr; Relation bufrel; int status; if (BufferIsLocal(buffer)) - return FlushLocalBuffer(buffer, release) ? STATUS_OK : STATUS_ERROR; + return FlushLocalBuffer(buffer, sync, release) ? 
STATUS_OK : STATUS_ERROR; if (BAD_BUFFER_ID(buffer)) return STATUS_ERROR; @@ -755,12 +760,16 @@ FlushBuffer(Buffer buffer, bool release) */ LockBuffer(BufferDescriptorGetBuffer(bufHdr), BUFFER_LOCK_SHARE); - status = smgrflush(DEFAULT_SMGR, bufrel, bufHdr->tag.blockNum, - (char *) MAKE_PTR(bufHdr->data)); + if (sync) + status = smgrflush(DEFAULT_SMGR, bufrel, bufHdr->tag.blockNum, + (char *) MAKE_PTR(bufHdr->data)); + else + status = smgrwrite(DEFAULT_SMGR, bufrel, bufHdr->tag.blockNum, + (char *) MAKE_PTR(bufHdr->data)); LockBuffer(BufferDescriptorGetBuffer(bufHdr), BUFFER_LOCK_UNLOCK); - /* drop relcache refcnt incremented by RelationIdCacheGetRelation */ + /* drop relcache refcnt incremented by RelationNodeCacheGetRelation */ RelationDecrementReferenceCount(bufrel); if (status == SM_FAIL) @@ -926,7 +935,7 @@ SetBufferDirtiedByMe(Buffer buffer, BufferDesc *bufHdr) /* * drop relcache refcnt incremented by - * RelationIdCacheGetRelation + * RelationNodeCacheGetRelation */ RelationDecrementReferenceCount(reln); } @@ -1123,7 +1132,7 @@ BufferSync() bufHdr->flags &= ~BM_DIRTY; } - /* drop refcnt obtained by RelationIdCacheGetRelation */ + /* drop refcnt obtained by RelationNodeCacheGetRelation */ if (reln != (Relation) NULL) RelationDecrementReferenceCount(reln); } @@ -1154,7 +1163,7 @@ BufferSync() /* * drop relcache refcnt incremented by - * RelationIdCacheGetRelation + * RelationNodeCacheGetRelation */ RelationDecrementReferenceCount(reln); @@ -1458,7 +1467,7 @@ BufferReplace(BufferDesc *bufHdr) SpinAcquire(BufMgrLock); - /* drop relcache refcnt incremented by RelationIdCacheGetRelation */ + /* drop relcache refcnt incremented by RelationNodeCacheGetRelation */ if (reln != (Relation) NULL) RelationDecrementReferenceCount(reln); @@ -1495,21 +1504,23 @@ RelationGetNumberOfBlocks(Relation relation) } /* --------------------------------------------------------------------- - * ReleaseRelationBuffers + * DropRelationBuffers * * This function removes all the buffered 
pages for a relation * from the buffer pool. Dirty pages are simply dropped, without - * bothering to write them out first. This is used when the - * relation is about to be deleted. We assume that the caller - * holds an exclusive lock on the relation, which should assure - * that no new buffers will be acquired for the rel meanwhile. + * bothering to write them out first. This is NOT rollback-able, + * and so should be used only with extreme caution! + * + * We assume that the caller holds an exclusive lock on the relation, + * which should assure that no new buffers will be acquired for the rel + * meanwhile. * * XXX currently it sequentially searches the buffer pool, should be * changed to more clever ways of searching. * -------------------------------------------------------------------- */ void -ReleaseRelationBuffers(Relation rel) +DropRelationBuffers(Relation rel) { int i; BufferDesc *bufHdr; @@ -1589,7 +1600,104 @@ recheck: * this rel, since we hold exclusive lock on this rel. */ if (RelFileNodeEquals(rel->rd_node, - BufferTagLastDirtied[i - 1].rnode)) + BufferTagLastDirtied[i - 1].rnode)) + BufferDirtiedByMe[i - 1] = false; + } + + SpinRelease(BufMgrLock); +} + +/* --------------------------------------------------------------------- + * DropRelFileNodeBuffers + * + * This is the same as DropRelationBuffers, except that the target + * relation is specified by RelFileNode. + * + * This is NOT rollback-able. One legitimate use is to clear the + * buffer cache of buffers for a relation that is being deleted + * during transaction abort. + * -------------------------------------------------------------------- + */ +void +DropRelFileNodeBuffers(RelFileNode rnode) +{ + int i; + BufferDesc *bufHdr; + + /* We have to search both local and shared buffers... 
*/ + + for (i = 0; i < NLocBuffer; i++) + { + bufHdr = &LocalBufferDescriptors[i]; + if (RelFileNodeEquals(bufHdr->tag.rnode, rnode)) + { + bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED); + LocalRefCount[i] = 0; + bufHdr->tag.rnode.relNode = InvalidOid; + } + } + + SpinAcquire(BufMgrLock); + for (i = 1; i <= NBuffers; i++) + { + bufHdr = &BufferDescriptors[i - 1]; +recheck: + if (RelFileNodeEquals(bufHdr->tag.rnode, rnode)) + { + + /* + * If there is I/O in progress, better wait till it's done; + * don't want to delete the relation out from under someone + * who's just trying to flush the buffer! + */ + if (bufHdr->flags & BM_IO_IN_PROGRESS) + { + WaitIO(bufHdr, BufMgrLock); + + /* + * By now, the buffer very possibly belongs to some other + * rel, so check again before proceeding. + */ + goto recheck; + } + /* Now we can do what we came for */ + bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED); + + /* + * Release any refcount we may have. + * + * This is very probably dead code, and if it isn't then it's + * probably wrong. I added the Assert to find out --- tgl + * 11/99. + */ + if (!(bufHdr->flags & BM_FREE)) + { + /* Assert checks that buffer will actually get freed! */ + Assert(PrivateRefCount[i - 1] == 1 && + bufHdr->refcount == 1); + /* ReleaseBuffer expects we do not hold the lock at entry */ + SpinRelease(BufMgrLock); + ReleaseBuffer(i); + SpinAcquire(BufMgrLock); + } + /* + * And mark the buffer as no longer occupied by this rel. + */ + BufTableDelete(bufHdr); + } + + /* + * Also check to see if BufferDirtiedByMe info for this buffer + * refers to the target relation, and clear it if so. This is + * independent of whether the current contents of the buffer + * belong to the target relation! + * + * NOTE: we have no way to clear BufferDirtiedByMe info in other + * backends, but hopefully there are none with that bit set for + * this rel, since we hold exclusive lock on this rel. 
+ */ + if (RelFileNodeEquals(rnode, + BufferTagLastDirtied[i - 1].rnode)) BufferDirtiedByMe[i - 1] = false; } @@ -1604,7 +1712,7 @@ recheck: * bothering to write them out first. This is used when we destroy a * database, to avoid trying to flush data to disk when the directory * tree no longer exists. Implementation is pretty similar to - * ReleaseRelationBuffers() which is for destroying just one relation. + * DropRelationBuffers() which is for destroying just one relation. * -------------------------------------------------------------------- */ void @@ -1757,33 +1865,32 @@ BufferPoolBlowaway() /* --------------------------------------------------------------------- * FlushRelationBuffers * - * This function flushes all dirty pages of a relation out to disk. + * This function writes all dirty pages of a relation out to disk. * Furthermore, pages that have blocknumber >= firstDelBlock are * actually removed from the buffer pool. An error code is returned * if we fail to dump a dirty buffer or if we find one of * the target pages is pinned into the cache. * - * This is used by VACUUM before truncating the relation to the given - * number of blocks. (TRUNCATE TABLE also uses it in the same way.) - * It might seem unnecessary to flush dirty pages before firstDelBlock, - * since VACUUM should already have committed its changes. However, - * it is possible for there still to be dirty pages: if some page - * had unwritten on-row tuple status updates from a prior transaction, - * and VACUUM had no additional changes to make to that page, then - * VACUUM won't have written it. This is harmless in most cases but - * will break pg_upgrade, which relies on VACUUM to ensure that *all* - * tuples have correct on-row status. So, we check and flush all - * dirty pages of the rel regardless of block number. - * - * This is also used by RENAME TABLE (with firstDelBlock = 0) - * to clear out the buffer cache before renaming the physical files of - * a relation. 
Without that, some other backend might try to do a - * blind write of a buffer page (relying on the BlindId of the buffer) - * and fail because it's not got the right filename anymore. + * This is called by DROP TABLE to clear buffers for the relation + * from the buffer pool. Note that we must write dirty buffers, + * rather than just dropping the changes, because our transaction + * might abort later on; we want to roll back safely in that case. + * + * This is also called by VACUUM before truncating the relation to the + * given number of blocks. It might seem unnecessary for VACUUM to + * write dirty pages before firstDelBlock, since VACUUM should already + * have committed its changes. However, it is possible for there still + * to be dirty pages: if some page had unwritten on-row tuple status + * updates from a prior transaction, and VACUUM had no additional + * changes to make to that page, then VACUUM won't have written it. + * This is harmless in most cases but will break pg_upgrade, which + * relies on VACUUM to ensure that *all* tuples have correct on-row + * status. So, we check and flush all dirty pages of the rel + * regardless of block number. * * In all cases, the caller should be holding AccessExclusiveLock on * the target relation to ensure that no other backend is busy reading - * more blocks of the relation. + * more blocks of the relation (or might do so before we commit). * * Formerly, we considered it an error condition if we found dirty * buffers here. 
However, since BufferSync no longer forces out all @@ -1812,7 +1919,7 @@ FlushRelationBuffers(Relation rel, BlockNumber firstDelBlock) { if (bufHdr->flags & BM_DIRTY) { - if (FlushBuffer(-i - 1, false) != STATUS_OK) + if (FlushBuffer(-i - 1, false, false) != STATUS_OK) { elog(NOTICE, "FlushRelationBuffers(%s (local), %u): block %u is dirty, could not flush it", RelationGetRelationName(rel), firstDelBlock, @@ -1840,15 +1947,17 @@ FlushRelationBuffers(Relation rel, BlockNumber firstDelBlock) for (i = 0; i < NBuffers; i++) { bufHdr = &BufferDescriptors[i]; -recheck: if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node)) { if (bufHdr->flags & BM_DIRTY) { PinBuffer(bufHdr); SpinRelease(BufMgrLock); - if (FlushBuffer(i + 1, true) != STATUS_OK) + if (FlushBuffer(i + 1, false, false) != STATUS_OK) { + SpinAcquire(BufMgrLock); + UnpinBuffer(bufHdr); + SpinRelease(BufMgrLock); elog(NOTICE, "FlushRelationBuffers(%s, %u): block %u is dirty (private %ld, global %d), could not flush it", RelationGetRelationName(rel), firstDelBlock, bufHdr->tag.blockNum, @@ -1856,12 +1965,7 @@ recheck: return -1; } SpinAcquire(BufMgrLock); - - /* - * Buffer could already be reassigned, so must recheck - * whether it still belongs to rel before freeing it! 
- */ - goto recheck; + UnpinBuffer(bufHdr); } if (!(bufHdr->flags & BM_FREE)) { diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c index faa3304b4f6..352f519bdc0 100644 --- a/src/backend/storage/buffer/localbuf.c +++ b/src/backend/storage/buffer/localbuf.c @@ -16,7 +16,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/storage/buffer/localbuf.c,v 1.33 2000/10/28 16:20:56 vadim Exp $ + * $Header: /cvsroot/pgsql/src/backend/storage/buffer/localbuf.c,v 1.34 2000/11/08 22:09:59 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -183,7 +183,7 @@ WriteLocalBuffer(Buffer buffer, bool release) * flushes a local buffer */ int -FlushLocalBuffer(Buffer buffer, bool release) +FlushLocalBuffer(Buffer buffer, bool sync, bool release) { int bufid; Relation bufrel; @@ -199,13 +199,18 @@ FlushLocalBuffer(Buffer buffer, bool release) bufHdr = &LocalBufferDescriptors[bufid]; bufHdr->flags &= ~BM_DIRTY; bufrel = RelationNodeCacheGetRelation(bufHdr->tag.rnode); - Assert(bufrel != NULL); - smgrflush(DEFAULT_SMGR, bufrel, bufHdr->tag.blockNum, - (char *) MAKE_PTR(bufHdr->data)); + + if (sync) + smgrflush(DEFAULT_SMGR, bufrel, bufHdr->tag.blockNum, + (char *) MAKE_PTR(bufHdr->data)); + else + smgrwrite(DEFAULT_SMGR, bufrel, bufHdr->tag.blockNum, + (char *) MAKE_PTR(bufHdr->data)); + LocalBufferFlushCount++; - /* drop relcache refcount incremented by RelationIdCacheGetRelation */ + /* drop relcache refcount incremented by RelationNodeCacheGetRelation */ RelationDecrementReferenceCount(bufrel); if (release) diff --git a/src/backend/storage/buffer/xlog_bufmgr.c b/src/backend/storage/buffer/xlog_bufmgr.c index dcd377b7eb3..15c4321405e 100644 --- a/src/backend/storage/buffer/xlog_bufmgr.c +++ b/src/backend/storage/buffer/xlog_bufmgr.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/storage/buffer/Attic/xlog_bufmgr.c,v 1.1 2000/10/28 16:20:56 vadim Exp $ + * $Header: 
/cvsroot/pgsql/src/backend/storage/buffer/Attic/xlog_bufmgr.c,v 1.2 2000/11/08 22:09:59 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -838,7 +838,7 @@ BufferSync() SpinRelease(BufMgrLock); - /* drop refcnt obtained by RelationIdCacheGetRelation */ + /* drop refcnt obtained by RelationNodeCacheGetRelation */ if (reln != (Relation) NULL) { RelationDecrementReferenceCount(reln); @@ -1128,7 +1128,7 @@ BufferReplace(BufferDesc *bufHdr) false); /* no fsync */ } - /* drop relcache refcnt incremented by RelationIdCacheGetRelation */ + /* drop relcache refcnt incremented by RelationNodeCacheGetRelation */ if (reln != (Relation) NULL) RelationDecrementReferenceCount(reln); @@ -1159,21 +1159,23 @@ RelationGetNumberOfBlocks(Relation relation) } /* --------------------------------------------------------------------- - * ReleaseRelationBuffers + * DropRelationBuffers * * This function removes all the buffered pages for a relation * from the buffer pool. Dirty pages are simply dropped, without - * bothering to write them out first. This is used when the - * relation is about to be deleted. We assume that the caller - * holds an exclusive lock on the relation, which should assure - * that no new buffers will be acquired for the rel meanwhile. + * bothering to write them out first. This is NOT rollback-able, + * and so should be used only with extreme caution! + * + * We assume that the caller holds an exclusive lock on the relation, + * which should assure that no new buffers will be acquired for the rel + * meanwhile. * * XXX currently it sequentially searches the buffer pool, should be * changed to more clever ways of searching. 
* -------------------------------------------------------------------- */ void -ReleaseRelationBuffers(Relation rel) +DropRelationBuffers(Relation rel) { int i; BufferDesc *bufHdr; @@ -1249,6 +1251,91 @@ recheck: } /* --------------------------------------------------------------------- + * DropRelFileNodeBuffers + * + * This is the same as DropRelationBuffers, except that the target + * relation is specified by RelFileNode. + * + * This is NOT rollback-able. One legitimate use is to clear the + * buffer cache of buffers for a relation that is being deleted + * during transaction abort. + * -------------------------------------------------------------------- + */ +void +DropRelFileNodeBuffers(RelFileNode rnode) +{ + int i; + BufferDesc *bufHdr; + + /* We have to search both local and shared buffers... */ + + for (i = 0; i < NLocBuffer; i++) + { + bufHdr = &LocalBufferDescriptors[i]; + if (RelFileNodeEquals(bufHdr->tag.rnode, rnode)) + { + bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED); + bufHdr->cntxDirty = false; + LocalRefCount[i] = 0; + bufHdr->tag.rnode.relNode = InvalidOid; + } + } + + SpinAcquire(BufMgrLock); + for (i = 1; i <= NBuffers; i++) + { + bufHdr = &BufferDescriptors[i - 1]; +recheck: + if (RelFileNodeEquals(bufHdr->tag.rnode, rnode)) + { + + /* + * If there is I/O in progress, better wait till it's done; + * don't want to delete the relation out from under someone + * who's just trying to flush the buffer! + */ + if (bufHdr->flags & BM_IO_IN_PROGRESS) + { + WaitIO(bufHdr, BufMgrLock); + + /* + * By now, the buffer very possibly belongs to some other + * rel, so check again before proceeding. + */ + goto recheck; + } + /* Now we can do what we came for */ + bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED); + bufHdr->cntxDirty = false; + + /* + * Release any refcount we may have. + * + * This is very probably dead code, and if it isn't then it's + * probably wrong. I added the Assert to find out --- tgl + * 11/99. 
+ */ + if (!(bufHdr->flags & BM_FREE)) + { + /* Assert checks that buffer will actually get freed! */ + Assert(PrivateRefCount[i - 1] == 1 && + bufHdr->refcount == 1); + /* ReleaseBuffer expects we do not hold the lock at entry */ + SpinRelease(BufMgrLock); + ReleaseBuffer(i); + SpinAcquire(BufMgrLock); + } + /* + * And mark the buffer as no longer occupied by this rel. + */ + BufTableDelete(bufHdr); + } + } + + SpinRelease(BufMgrLock); +} + +/* --------------------------------------------------------------------- * DropBuffers * * This function removes all the buffers in the buffer cache for a @@ -1256,7 +1343,7 @@ recheck: * bothering to write them out first. This is used when we destroy a * database, to avoid trying to flush data to disk when the directory * tree no longer exists. Implementation is pretty similar to - * ReleaseRelationBuffers() which is for destroying just one relation. + * DropRelationBuffers() which is for destroying just one relation. * -------------------------------------------------------------------- */ void @@ -1399,33 +1486,32 @@ BufferPoolBlowaway() /* --------------------------------------------------------------------- * FlushRelationBuffers * - * This function flushes all dirty pages of a relation out to disk. + * This function writes all dirty pages of a relation out to disk. * Furthermore, pages that have blocknumber >= firstDelBlock are * actually removed from the buffer pool. An error code is returned * if we fail to dump a dirty buffer or if we find one of * the target pages is pinned into the cache. * - * This is used by VACUUM before truncating the relation to the given - * number of blocks. (TRUNCATE TABLE also uses it in the same way.) - * It might seem unnecessary to flush dirty pages before firstDelBlock, - * since VACUUM should already have committed its changes. 
However, - * it is possible for there still to be dirty pages: if some page - * had unwritten on-row tuple status updates from a prior transaction, - * and VACUUM had no additional changes to make to that page, then - * VACUUM won't have written it. This is harmless in most cases but - * will break pg_upgrade, which relies on VACUUM to ensure that *all* - * tuples have correct on-row status. So, we check and flush all - * dirty pages of the rel regardless of block number. + * This is called by DROP TABLE to clear buffers for the relation + * from the buffer pool. Note that we must write dirty buffers, + * rather than just dropping the changes, because our transaction + * might abort later on; we want to roll back safely in that case. * - * This is also used by RENAME TABLE (with firstDelBlock = 0) - * to clear out the buffer cache before renaming the physical files of - * a relation. Without that, some other backend might try to do a - * blind write of a buffer page (relying on the BlindId of the buffer) - * and fail because it's not got the right filename anymore. + * This is also called by VACUUM before truncating the relation to the + * given number of blocks. It might seem unnecessary for VACUUM to + * write dirty pages before firstDelBlock, since VACUUM should already + * have committed its changes. However, it is possible for there still + * to be dirty pages: if some page had unwritten on-row tuple status + * updates from a prior transaction, and VACUUM had no additional + * changes to make to that page, then VACUUM won't have written it. + * This is harmless in most cases but will break pg_upgrade, which + * relies on VACUUM to ensure that *all* tuples have correct on-row + * status. So, we check and flush all dirty pages of the rel + * regardless of block number. * * In all cases, the caller should be holding AccessExclusiveLock on * the target relation to ensure that no other backend is busy reading - * more blocks of the relation. 
+ * more blocks of the relation (or might do so before we commit). * * Formerly, we considered it an error condition if we found dirty * buffers here. However, since BufferSync no longer forces out all |