Diffstat (limited to 'src/backend/storage')
-rw-r--r--   src/backend/storage/buffer/bufmgr.c        | 214
-rw-r--r--   src/backend/storage/buffer/localbuf.c      |   7
-rw-r--r--   src/backend/storage/freespace/README       |   4
-rw-r--r--   src/backend/storage/freespace/freespace.c  |  10
-rw-r--r--   src/backend/storage/freespace/fsmpage.c    |   2
-rw-r--r--   src/backend/storage/page/bufpage.c         | 183
6 files changed, 357 insertions, 63 deletions
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 0b4c2ed0a01..1cd0ac806a7 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -34,6 +34,7 @@
 #include <unistd.h>
 
 #include "catalog/catalog.h"
+#include "catalog/storage.h"
 #include "common/relpath.h"
 #include "executor/instrument.h"
 #include "miscadmin.h"
@@ -431,6 +432,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
     {
         /* new buffers are zero-filled */
         MemSet((char *) bufBlock, 0, BLCKSZ);
+        /* don't set checksum for all-zero page */
         smgrextend(smgr, forkNum, blockNum, (char *) bufBlock, false);
     }
     else
@@ -460,13 +462,13 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
         }
 
         /* check for garbage data */
-        if (!PageHeaderIsValid((PageHeader) bufBlock))
+        if (!PageIsVerified((Page) bufBlock, blockNum))
         {
             if (mode == RBM_ZERO_ON_ERROR || zero_damaged_pages)
             {
                 ereport(WARNING,
                         (errcode(ERRCODE_DATA_CORRUPTED),
-                         errmsg("invalid page header in block %u of relation %s; zeroing out page",
+                         errmsg("invalid page in block %u of relation %s; zeroing out page",
                                 blockNum,
                                 relpath(smgr->smgr_rnode, forkNum))));
                 MemSet((char *) bufBlock, 0, BLCKSZ);
@@ -474,7 +476,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
             else
                 ereport(ERROR,
                         (errcode(ERRCODE_DATA_CORRUPTED),
-                         errmsg("invalid page header in block %u of relation %s",
+                         errmsg("invalid page in block %u of relation %s",
                                 blockNum,
                                 relpath(smgr->smgr_rnode, forkNum))));
         }
@@ -655,14 +657,23 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
                  * victim.  We need lock to inspect the page LSN, so this
                  * can't be done inside StrategyGetBuffer.
                  */
-                if (strategy != NULL &&
-                    XLogNeedsFlush(BufferGetLSN(buf)) &&
-                    StrategyRejectBuffer(strategy, buf))
+                if (strategy != NULL)
                 {
-                    /* Drop lock/pin and loop around for another buffer */
-                    LWLockRelease(buf->content_lock);
-                    UnpinBuffer(buf, true);
-                    continue;
+                    XLogRecPtr  lsn;
+
+                    /* Read the LSN while holding buffer header lock */
+                    LockBufHdr(buf);
+                    lsn = BufferGetLSN(buf);
+                    UnlockBufHdr(buf);
+
+                    if (XLogNeedsFlush(lsn) &&
+                        StrategyRejectBuffer(strategy, buf))
+                    {
+                        /* Drop lock/pin and loop around for another buffer */
+                        LWLockRelease(buf->content_lock);
+                        UnpinBuffer(buf, true);
+                        continue;
+                    }
                 }
 
                 /* OK, do the I/O */
@@ -1906,6 +1917,8 @@ FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln)
     ErrorContextCallback errcallback;
     instr_time  io_start,
                 io_time;
+    Block       bufBlock;
+    char       *bufToWrite;
 
     /*
      * Acquire the buffer's io_in_progress lock.  If StartBufferIO returns
@@ -1931,6 +1944,18 @@ FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln)
                 reln->smgr_rnode.node.dbNode,
                 reln->smgr_rnode.node.relNode);
 
+    LockBufHdr(buf);
+
+    /*
+     * Run PageGetLSN while holding header lock, since we don't have the
+     * buffer locked exclusively in all cases.
+     */
+    recptr = BufferGetLSN(buf);
+
+    /* To check if block content changes while flushing. - vadim 01/17/97 */
+    buf->flags &= ~BM_JUST_DIRTIED;
+    UnlockBufHdr(buf);
+
     /*
      * Force XLOG flush up to buffer's LSN.  This implements the basic WAL
      * rule that log updates must hit disk before any of the data-file changes
@@ -1949,10 +1974,7 @@ FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln)
      * buffer isn't permanent.
      */
     if (buf->flags & BM_PERMANENT)
-    {
-        recptr = BufferGetLSN(buf);
         XLogFlush(recptr);
-    }
 
     /*
      * Now it's safe to write buffer to disk.  Note that no one else should
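Note: the BufferAlloc and FlushBuffer hunks above both switch to reading the page LSN under
the buffer header spinlock. Once MarkBufferDirtyHint() (later in this file) can set pd_lsn
while holding only a share lock, any reader without an exclusive content lock needs the
header lock for a consistent read. A minimal sketch of that discipline as a hypothetical
helper (not part of the patch; BufferDesc, LockBufHdr and BufferGetLSN are the same
bufmgr.c internals used above):

    static bool
    victim_needs_wal_flush(volatile BufferDesc *buf)
    {
        XLogRecPtr  lsn;

        /* pd_lsn may be updated concurrently; the header lock makes the read atomic */
        LockBufHdr(buf);
        lsn = BufferGetLSN(buf);
        UnlockBufHdr(buf);

        /* true if this page cannot be written before WAL is flushed up to its LSN */
        return XLogNeedsFlush(lsn);
    }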
@@ -1960,18 +1982,20 @@ FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln)
      * we have the io_in_progress lock.
      */
 
-    /* To check if block content changes while flushing. - vadim 01/17/97 */
-    LockBufHdr(buf);
-    buf->flags &= ~BM_JUST_DIRTIED;
-    UnlockBufHdr(buf);
+    bufBlock = BufHdrGetBlock(buf);
+
+    bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
 
     if (track_io_timing)
         INSTR_TIME_SET_CURRENT(io_start);
 
+    /*
+     * bufToWrite is either the shared buffer or a copy, as appropriate.
+     */
     smgrwrite(reln,
               buf->tag.forkNum,
               buf->tag.blockNum,
-              (char *) BufHdrGetBlock(buf),
+              bufToWrite,
               false);
 
     if (track_io_timing)
@@ -2042,6 +2066,34 @@ BufferIsPermanent(Buffer buffer)
     return (bufHdr->flags & BM_PERMANENT) != 0;
 }
 
+/*
+ * BufferGetLSNAtomic
+ *      Retrieves the LSN of the buffer atomically using a buffer header lock.
+ *      This is necessary for some callers who may not have an exclusive lock
+ *      on the buffer.
+ */
+XLogRecPtr
+BufferGetLSNAtomic(Buffer buffer)
+{
+    volatile BufferDesc *bufHdr = &BufferDescriptors[buffer - 1];
+    char       *page = BufferGetPage(buffer);
+    XLogRecPtr  lsn;
+
+    /* Local buffers don't need a lock. */
+    if (BufferIsLocal(buffer))
+        return PageGetLSN(page);
+
+    /* Make sure we've got a real buffer, and that we hold a pin on it. */
+    Assert(BufferIsValid(buffer));
+    Assert(BufferIsPinned(buffer));
+
+    LockBufHdr(bufHdr);
+    lsn = PageGetLSN(page);
+    UnlockBufHdr(bufHdr);
+
+    return lsn;
+}
+
 /* ---------------------------------------------------------------------
  *      DropRelFileNodeBuffers
  *
@@ -2343,7 +2395,10 @@ FlushRelationBuffers(Relation rel)
             if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
                 (bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
             {
-                ErrorContextCallback errcallback;
+                ErrorContextCallback errcallback;
+                Page        localpage;
+
+                localpage = (char *) LocalBufHdrGetBlock(bufHdr);
 
                 /* Setup error traceback support for ereport() */
                 errcallback.callback = local_buffer_write_error_callback;
                 errcallback.arg = (void *) bufHdr;
                 errcallback.previous = error_context_stack;
                 error_context_stack = &errcallback;
 
+                PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
+
                 smgrwrite(rel->rd_smgr,
                           bufHdr->tag.forkNum,
                           bufHdr->tag.blockNum,
-                          (char *) LocalBufHdrGetBlock(bufHdr),
+                          localpage,
                           false);
 
                 bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED);
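Note: BufferGetLSNAtomic(), added above, packages the same locking rule for callers outside
bufmgr.c. A sketch of the expected pattern at a hypothetical share-locking call site (not
part of the patch):

    LockBuffer(buffer, BUFFER_LOCK_SHARE);

    /*
     * A bare PageGetLSN(BufferGetPage(buffer)) would be racy here: another
     * backend may set the LSN concurrently via MarkBufferDirtyHint().
     */
    lsn = BufferGetLSNAtomic(buffer);

    LockBuffer(buffer, BUFFER_LOCK_UNLOCK);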
@@ -2509,22 +2566,24 @@ IncrBufferRefCount(Buffer buffer)
 }
 
 /*
- * SetBufferCommitInfoNeedsSave
+ * MarkBufferDirtyHint
  *
- * Mark a buffer dirty when we have updated tuple commit-status bits in it.
+ * Mark a buffer dirty for non-critical changes.
  *
- * This is essentially the same as MarkBufferDirty, except that the caller
- * might have only share-lock instead of exclusive-lock on the buffer's
- * content lock.  We preserve the distinction mainly as a way of documenting
- * that the caller has not made a critical data change --- the status-bit
- * update could be redone by someone else just as easily.  Therefore, no WAL
- * log record need be generated, whereas calls to MarkBufferDirty really ought
- * to be associated with a WAL-entry-creating action.
+ * This is essentially the same as MarkBufferDirty, except:
+ *
+ * 1. The caller does not write WAL; so if checksums are enabled, we may need
+ *    to write an XLOG_HINT WAL record to protect against torn pages.
+ * 2. The caller might have only share-lock instead of exclusive-lock on the
+ *    buffer's content lock.
+ * 3. This function does not guarantee that the buffer is always marked dirty
+ *    (due to a race condition), so it cannot be used for important changes.
  */
 void
-SetBufferCommitInfoNeedsSave(Buffer buffer)
+MarkBufferDirtyHint(Buffer buffer)
 {
     volatile BufferDesc *bufHdr;
+    Page        page = BufferGetPage(buffer);
 
     if (!BufferIsValid(buffer))
         elog(ERROR, "bad buffer ID: %d", buffer);
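Note: the contract above restricts MarkBufferDirtyHint() to changes that can be recomputed
if lost. The typical call pattern, sketched after SetHintBits() in
src/backend/utils/time/tqual.c (a caller this commit updates outside the storage/ subtree
shown here; htup is assumed to be a HeapTupleHeader on the locked page):

    /* A share lock suffices for a recomputable hint, per point 2 above. */
    LockBuffer(buffer, BUFFER_LOCK_SHARE);

    htup->t_infomask |= HEAP_XMIN_COMMITTED;    /* redundant, recheckable info */
    MarkBufferDirtyHint(buffer);

    LockBuffer(buffer, BUFFER_LOCK_UNLOCK);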
@@ -2544,28 +2603,105 @@ MarkBufferDirtyHint(Buffer buffer)
     /*
      * This routine might get called many times on the same page, if we are
      * making the first scan after commit of an xact that added/deleted many
-     * tuples. So, be as quick as we can if the buffer is already dirty.  We
-     * do this by not acquiring spinlock if it looks like the status bits are
-     * already.  Since we make this test unlocked, there's a chance we might
-     * fail to notice that the flags have just been cleared, and failed to
-     * reset them, due to memory-ordering issues.  But since this function is
-     * only intended to be used in cases where failing to write out the data
+     * tuples. So, be as quick as we can if the buffer is already dirty.  We do
+     * this by not acquiring spinlock if it looks like the status bits are
+     * already set.  Since we make this test unlocked, there's a chance we
+     * might fail to notice that the flags have just been cleared, and failed
+     * to reset them, due to memory-ordering issues.  But since this function
+     * is only intended to be used in cases where failing to write out the data
      * would be harmless anyway, it doesn't really matter.
      */
     if ((bufHdr->flags & (BM_DIRTY | BM_JUST_DIRTIED)) !=
         (BM_DIRTY | BM_JUST_DIRTIED))
     {
+        XLogRecPtr  lsn = InvalidXLogRecPtr;
+        bool        dirtied = false;
+        bool        delayChkpt = false;
+
+        /*
+         * If checksums are enabled, and the buffer is permanent, then a full
+         * page image may be required even for some hint bit updates to protect
+         * against torn pages.  This full page image is only necessary if the
+         * hint bit update is the first change to the page since the last
+         * checkpoint.
+         *
+         * We don't check full_page_writes here because that logic is
+         * included when we call XLogInsert() since the value changes
+         * dynamically.
+         */
+        if (DataChecksumsEnabled() && (bufHdr->flags & BM_PERMANENT))
+        {
+            /*
+             * If we're in recovery we cannot dirty a page because of a hint.
+             * We can set the hint, just not dirty the page as a result so
+             * the hint is lost when we evict the page or shut down.
+             *
+             * See src/backend/storage/page/README for longer discussion.
+             */
+            if (RecoveryInProgress())
+                return;
+
+            /*
+             * If the block is already dirty because we either made a change
+             * or set a hint already, then we don't need to write a full page
+             * image.  Note that aggressive cleaning of blocks
+             * dirtied by hint bit setting would increase the call rate.
+             * Bulk setting of hint bits would reduce the call rate...
+             *
+             * We must issue the WAL record before we mark the buffer dirty.
+             * Otherwise we might write the page before we write the WAL.
+             * That causes a race condition, since a checkpoint might occur
+             * between writing the WAL record and marking the buffer dirty.
+             * We solve that with a kluge, but one that is already in use
+             * during transaction commit to prevent race conditions.
+             * Basically, we simply prevent the checkpoint WAL record from
+             * being written until we have marked the buffer dirty.  We don't
+             * start the checkpoint flush until we have marked dirty, so our
+             * checkpoint must flush the change to disk successfully or the
+             * checkpoint never gets written, so crash recovery will fix it.
+             *
+             * It's possible we may enter here without an xid, so it is
+             * essential that CreateCheckpoint waits for virtual transactions
+             * rather than full transactionids.
+             */
+            MyPgXact->delayChkpt = delayChkpt = true;
+            lsn = XLogSaveBufferForHint(buffer);
+        }
+
         LockBufHdr(bufHdr);
         Assert(bufHdr->refcount > 0);
         if (!(bufHdr->flags & BM_DIRTY))
         {
-            /* Do vacuum cost accounting */
+            dirtied = true;     /* Means "will be dirtied by this action" */
+
+            /*
+             * Set the page LSN if we wrote a backup block.  We aren't
+             * supposed to set this when only holding a share lock but
+             * as long as we serialise it somehow we're OK.  We choose to
+             * set LSN while holding the buffer header lock, which causes
+             * any reader of an LSN who holds only a share lock to also
+             * obtain a buffer header lock before using PageGetLSN().
+             * Fortunately, that's not too many places.
+             *
+             * If checksums are enabled, you might think we should reset the
+             * checksum here.  That will happen when the page is written
+             * sometime later in this checkpoint cycle.
+             */
+            if (!XLogRecPtrIsInvalid(lsn))
+                PageSetLSN(page, lsn);
+        }
+        bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
+        UnlockBufHdr(bufHdr);
+
+        if (delayChkpt)
+            MyPgXact->delayChkpt = false;
+
+        if (dirtied)
+        {
             VacuumPageDirty++;
             if (VacuumCostActive)
                 VacuumCostBalance += VacuumCostPageDirty;
         }
-        bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
-        UnlockBufHdr(bufHdr);
     }
 }
diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c
index 03055c96d85..c67271a4bdf 100644
--- a/src/backend/storage/buffer/localbuf.c
+++ b/src/backend/storage/buffer/localbuf.c
@@ -196,16 +196,19 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
      */
     if (bufHdr->flags & BM_DIRTY)
     {
-        SMgrRelation oreln;
+        SMgrRelation oreln;
+        Page        localpage = (char *) LocalBufHdrGetBlock(bufHdr);
 
         /* Find smgr relation for buffer */
         oreln = smgropen(bufHdr->tag.rnode, MyBackendId);
 
+        PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
+
         /* And write... */
         smgrwrite(oreln,
                   bufHdr->tag.forkNum,
                   bufHdr->tag.blockNum,
-                  (char *) LocalBufHdrGetBlock(bufHdr),
+                  localpage,
                   false);
 
         /* Mark not-dirty now in case we error out below */
diff --git a/src/backend/storage/freespace/README b/src/backend/storage/freespace/README
index 9732ad54d3e..bbd1b93fac6 100644
--- a/src/backend/storage/freespace/README
+++ b/src/backend/storage/freespace/README
@@ -169,7 +169,9 @@ Recovery
 --------
 
 The FSM is not explicitly WAL-logged. Instead, we rely on a bunch of
-self-correcting measures to repair possible corruption.
+self-correcting measures to repair possible corruption. As a result when
+we write to the FSM we treat that as a hint and thus use MarkBufferDirtyHint()
+rather than MarkBufferDirty().
 
 First of all, whenever a value is set on an FSM page, the root node of the
 page is compared against the new value after bubbling up the change is
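Note: the freespace.c and fsmpage.c hunks that follow apply the README's rule mechanically:
every FSM mutation is now flagged with MarkBufferDirtyHint(). Condensed to its core
(mirroring fsm_set_and_search() below; losing such a write is harmless because the
self-correcting measures the README describes rebuild the value):

    LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

    if (fsm_set_avail(BufferGetPage(buf), slot, new_cat))
        MarkBufferDirtyHint(buf);   /* hint only: a lost update self-corrects */

    UnlockReleaseBuffer(buf);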
diff --git a/src/backend/storage/freespace/freespace.c b/src/backend/storage/freespace/freespace.c
index 9c2afc5b0e8..b76bf9be6b4 100644
--- a/src/backend/storage/freespace/freespace.c
+++ b/src/backend/storage/freespace/freespace.c
@@ -216,7 +216,7 @@ XLogRecordPageWithFreeSpace(RelFileNode rnode, BlockNumber heapBlk,
         PageInit(page, BLCKSZ, 0);
 
     if (fsm_set_avail(page, slot, new_cat))
-        MarkBufferDirty(buf);
+        MarkBufferDirtyHint(buf);
     UnlockReleaseBuffer(buf);
 }
 
@@ -286,7 +286,7 @@ FreeSpaceMapTruncateRel(Relation rel, BlockNumber nblocks)
         return;                 /* nothing to do; the FSM was already smaller */
     LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
     fsm_truncate_avail(BufferGetPage(buf), first_removed_slot);
-    MarkBufferDirty(buf);
+    MarkBufferDirtyHint(buf);
     UnlockReleaseBuffer(buf);
 
     new_nfsmblocks = fsm_logical_to_physical(first_removed_address) + 1;
@@ -583,6 +583,8 @@ fsm_extend(Relation rel, BlockNumber fsm_nblocks)
 
     while (fsm_nblocks_now < fsm_nblocks)
     {
+        PageSetChecksumInplace(pg, fsm_nblocks_now);
+
         smgrextend(rel->rd_smgr, FSM_FORKNUM, fsm_nblocks_now,
                    (char *) pg, false);
         fsm_nblocks_now++;
@@ -617,7 +619,7 @@ fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot,
     page = BufferGetPage(buf);
 
     if (fsm_set_avail(page, slot, newValue))
-        MarkBufferDirty(buf);
+        MarkBufferDirtyHint(buf);
 
     if (minValue != 0)
     {
@@ -768,7 +770,7 @@ fsm_vacuum_page(Relation rel, FSMAddress addr, bool *eof_p)
         {
             LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
             fsm_set_avail(BufferGetPage(buf), slot, child_avail);
-            MarkBufferDirty(buf);
+            MarkBufferDirtyHint(buf);
             LockBuffer(buf, BUFFER_LOCK_UNLOCK);
         }
     }
diff --git a/src/backend/storage/freespace/fsmpage.c b/src/backend/storage/freespace/fsmpage.c
index acb8038a870..19c8e09148b 100644
--- a/src/backend/storage/freespace/fsmpage.c
+++ b/src/backend/storage/freespace/fsmpage.c
@@ -284,7 +284,7 @@ restart:
             exclusive_lock_held = true;
         }
         fsm_rebuild_page(page);
-        MarkBufferDirty(buf);
+        MarkBufferDirtyHint(buf);
         goto restart;
     }
 }
diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c
index 95f3e16bfb1..81cdc6547a3 100644
--- a/src/backend/storage/page/bufpage.c
+++ b/src/backend/storage/page/bufpage.c
@@ -15,7 +15,14 @@
 #include "postgres.h"
 
 #include "access/htup_details.h"
+#include "access/xlog.h"
 
+bool        ignore_checksum_failure = false;
+
+static char pageCopyData[BLCKSZ];   /* for checksum calculation */
+static Page pageCopy = pageCopyData;
+
+static uint16 PageCalcChecksum16(Page page, BlockNumber blkno);
 
 /* ----------------------------------------------------------------
  *                      Page support functions
@@ -25,6 +32,8 @@
 /*
  * PageInit
  *      Initializes the contents of a page.
+ *      Note that we don't calculate an initial checksum here; that's not done
+ *      until it's time to write.
  */
 void
 PageInit(Page page, Size pageSize, Size specialSize)
@@ -39,7 +48,7 @@ PageInit(Page page, Size pageSize, Size specialSize)
     /* Make sure all fields of page are zero, as well as unused space */
     MemSet(p, 0, pageSize);
 
-    /* p->pd_flags = 0;                             done by above MemSet */
+    p->pd_flags = 0;
     p->pd_lower = SizeOfPageHeaderData;
     p->pd_upper = pageSize - specialSize;
     p->pd_special = pageSize - specialSize;
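Note: PageInit() above deliberately leaves pd_checksum zero; a checksum is only meaningful
for the on-disk image, so it is stamped immediately before each write. For a page in
private memory the in-place variant suffices, as in the fsm_extend() hunk earlier
(condensed sketch, identifiers as in that hunk):

    PageSetChecksumInplace(pg, blkno);  /* stamp pd_checksum as the last step */
    smgrextend(rel->rd_smgr, FSM_FORKNUM, blkno, (char *) pg, false);

Shared buffers instead go through PageSetChecksumCopy(), defined at the end of this file,
because other backends may legitimately set hint bits while the write is in progress.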
@@ -49,8 +58,8 @@ PageInit(Page page, Size pageSize, Size specialSize)
 
 
 /*
- * PageHeaderIsValid
- *      Check that the header fields of a page appear valid.
+ * PageIsVerified
+ *      Check that the page header and checksum (if any) appear valid.
  *
  * This is called when a page has just been read in from disk.  The idea is
  * to cheaply detect trashed pages before we go nuts following bogus item
@@ -67,30 +76,77 @@ PageInit(Page page, Size pageSize, Size specialSize)
  * will clean up such a page and make it usable.
  */
 bool
-PageHeaderIsValid(PageHeader page)
+PageIsVerified(Page page, BlockNumber blkno)
 {
+    PageHeader  p = (PageHeader) page;
     char       *pagebytes;
     int         i;
+    bool        checksum_failure = false;
+    bool        header_sane = false;
+    bool        all_zeroes = false;
+    uint16      checksum;
 
-    /* Check normal case */
-    if (PageGetPageSize(page) == BLCKSZ &&
-        PageGetPageLayoutVersion(page) == PG_PAGE_LAYOUT_VERSION &&
-        (page->pd_flags & ~PD_VALID_FLAG_BITS) == 0 &&
-        page->pd_lower >= SizeOfPageHeaderData &&
-        page->pd_lower <= page->pd_upper &&
-        page->pd_upper <= page->pd_special &&
-        page->pd_special <= BLCKSZ &&
-        page->pd_special == MAXALIGN(page->pd_special))
-        return true;
+    /*
+     * Don't verify page data unless the page passes basic non-zero test
+     */
+    if (!PageIsNew(page))
+    {
+        if (DataChecksumsEnabled())
+        {
+            checksum = PageCalcChecksum16(page, blkno);
+
+            if (checksum != p->pd_checksum)
+                checksum_failure = true;
+        }
+
+        /*
+         * The following checks don't prove the header is correct,
+         * only that it looks sane enough to allow into the buffer pool.
+         * Later usage of the block can still reveal problems,
+         * which is why we offer the checksum option.
+         */
+        if ((p->pd_flags & ~PD_VALID_FLAG_BITS) == 0 &&
+            p->pd_lower <= p->pd_upper &&
+            p->pd_upper <= p->pd_special &&
+            p->pd_special <= BLCKSZ &&
+            p->pd_special == MAXALIGN(p->pd_special))
+            header_sane = true;
+
+        if (header_sane && !checksum_failure)
+            return true;
+    }
 
     /* Check all-zeroes case */
+    all_zeroes = true;
     pagebytes = (char *) page;
     for (i = 0; i < BLCKSZ; i++)
     {
         if (pagebytes[i] != 0)
-            return false;
+        {
+            all_zeroes = false;
+            break;
+        }
     }
+
+    if (all_zeroes)
+        return true;
+
+    /*
+     * Throw a WARNING if the checksum fails, but only after we've checked for
+     * the all-zeroes case.
+     */
+    if (checksum_failure)
+    {
+        ereport(WARNING,
+                (errcode(ERRCODE_DATA_CORRUPTED),
+                 errmsg("page verification failed, calculated checksum %u but expected %u",
+                        checksum, p->pd_checksum)));
+
+        if (header_sane && ignore_checksum_failure)
+            return true;
+    }
-    return true;
+
+    return false;
 }
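Note: PageIsVerified() above accepts a page in exactly three cases. A hypothetical
condensation of its control flow (not code in the patch; when checksums are disabled,
checksum_ok is vacuously true):

    static bool
    page_acceptable(bool header_sane, bool checksum_ok, bool all_zeroes)
    {
        if (all_zeroes)
            return true;    /* uninitialized page, never written to      */
        if (header_sane && checksum_ok)
            return true;    /* the normal case                           */
        if (header_sane && ignore_checksum_failure)
            return true;    /* operator accepts the risk; WARNING issued */
        return false;       /* caller zeroes the page or raises an ERROR */
    }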
@@ -827,3 +883,98 @@ PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems)
 
     pfree(itemidbase);
 }
+
+/*
+ * Set checksum for page in shared buffers.
+ *
+ * If checksums are disabled, or if the page is not initialized, just return
+ * the input.  Otherwise, we must make a copy of the page before calculating the
+ * checksum, to prevent concurrent modifications (e.g. setting hint bits) from
+ * making the final checksum invalid.
+ *
+ * Returns a pointer to the block-sized data that needs to be written. Uses
+ * statically-allocated memory, so the caller must immediately write the
+ * returned page and not refer to it again.
+ */
+char *
+PageSetChecksumCopy(Page page, BlockNumber blkno)
+{
+    if (PageIsNew(page) || !DataChecksumsEnabled())
+        return (char *) page;
+
+    /*
+     * We make a copy iff we need to calculate a checksum because other
+     * backends may set hint bits on this page while we write, which
+     * would mean the checksum differs from the page contents. It doesn't
+     * matter if we include or exclude hints during the copy, as long
+     * as we write a valid page and associated checksum.
+     */
+    memcpy((char *) pageCopy, (char *) page, BLCKSZ);
+    PageSetChecksumInplace(pageCopy, blkno);
+    return (char *) pageCopy;
+}
+
+/*
+ * Set checksum for page in private memory.
+ *
+ * This is a simpler version of PageSetChecksumCopy(). The more explicit API
+ * allows us to more easily see if we're making the correct call and reduces
+ * the amount of additional code specific to page verification.
+ */
+void
+PageSetChecksumInplace(Page page, BlockNumber blkno)
+{
+    if (PageIsNew(page))
+        return;
+
+    if (DataChecksumsEnabled())
+    {
+        PageHeader  p = (PageHeader) page;
+
+        p->pd_checksum = PageCalcChecksum16(page, blkno);
+    }
+
+    return;
+}
+
+/*
+ * Calculate checksum for a PostgreSQL Page.  This includes the block number (to
+ * detect the case when a page is somehow moved to a different location), the
+ * page header (excluding the checksum itself), and the page data.
+ *
+ * Note that if the checksum validation fails we cannot tell the difference
+ * between a transposed block and failure from direct on-block corruption,
+ * though that is better than just ignoring transposed blocks altogether.
+ */
+static uint16
+PageCalcChecksum16(Page page, BlockNumber blkno)
+{
+    pg_crc32    crc;
+    PageHeader  p = (PageHeader) page;
+
+    /* only calculate the checksum for properly-initialized pages */
+    Assert(!PageIsNew(page));
+
+    INIT_CRC32(crc);
+
+    /*
+     * Initialize the checksum calculation with the block number. This helps
+     * catch corruption from whole blocks being transposed with other whole
+     * blocks.
+     */
+    COMP_CRC32(crc, &blkno, sizeof(blkno));
+
+    /*
+     * Now add in the LSN, which is always the first field on the page.
+     */
+    COMP_CRC32(crc, page, sizeof(p->pd_lsn));
+
+    /*
+     * Now add the rest of the page, skipping the pd_checksum field.
+     */
+    COMP_CRC32(crc, page + sizeof(p->pd_lsn) + sizeof(p->pd_checksum),
+               BLCKSZ - sizeof(p->pd_lsn) - sizeof(p->pd_checksum));
+
+    FIN_CRC32(crc);
+
+    return (uint16) crc;
+}
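Note: in byte terms, and assuming the PageHeaderData layout in
src/include/storage/bufpage.h where pd_lsn occupies bytes 0..7 and pd_checksum bytes 8..9,
PageCalcChecksum16() above is equivalent to (sizes hard-coded for illustration only):

    COMP_CRC32(crc, &blkno, 4);                 /* 4-byte block number        */
    COMP_CRC32(crc, page, 8);                   /* pd_lsn: bytes 0..7         */
    /* bytes 8..9 skipped: pd_checksum cannot cover itself */
    COMP_CRC32(crc, page + 10, BLCKSZ - 10);    /* bytes 10 .. BLCKSZ-1       */

The final cast keeps only the low-order 16 bits of the 32-bit CRC, which is why the stored
value fits the existing uint16 pd_checksum field.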