Diffstat (limited to 'src/backend/storage/buffer/bufmgr.c')
 src/backend/storage/buffer/bufmgr.c | 794 ++++++++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 629 insertions(+), 165 deletions(-)
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index af073e9a395..92714a9fe2d 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -48,6 +48,7 @@
 #include "storage/buf_internals.h"
 #include "storage/bufmgr.h"
 #include "storage/ipc.h"
+#include "storage/lmgr.h"
 #include "storage/proc.h"
 #include "storage/smgr.h"
 #include "storage/standby.h"
@@ -450,6 +451,22 @@ static Buffer ReadBuffer_common(SMgrRelation smgr, char relpersistence,
 								ForkNumber forkNum, BlockNumber blockNum,
 								ReadBufferMode mode, BufferAccessStrategy strategy,
 								bool *hit);
+static BlockNumber ExtendBufferedRelCommon(ExtendBufferedWhat eb,
+										   ForkNumber fork,
+										   BufferAccessStrategy strategy,
+										   uint32 flags,
+										   uint32 extend_by,
+										   BlockNumber extend_upto,
+										   Buffer *buffers,
+										   uint32 *extended_by);
+static BlockNumber ExtendBufferedRelShared(ExtendBufferedWhat eb,
+										   ForkNumber fork,
+										   BufferAccessStrategy strategy,
+										   uint32 flags,
+										   uint32 extend_by,
+										   BlockNumber extend_upto,
+										   Buffer *buffers,
+										   uint32 *extended_by);
 static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy);
 static void PinBuffer_Locked(BufferDesc *buf);
 static void UnpinBuffer(BufferDesc *buf);
@@ -785,6 +802,180 @@ ReadBufferWithoutRelcache(RelFileLocator rlocator, ForkNumber forkNum,
 							 mode, strategy, &hit);
 }
 
+/*
+ * Convenience wrapper around ExtendBufferedRelBy() extending by one block.
+ */
+Buffer
+ExtendBufferedRel(ExtendBufferedWhat eb,
+				  ForkNumber forkNum,
+				  BufferAccessStrategy strategy,
+				  uint32 flags)
+{
+	Buffer		buf;
+	uint32		extend_by = 1;
+
+	ExtendBufferedRelBy(eb, forkNum, strategy, flags, extend_by,
+						&buf, &extend_by);
+
+	return buf;
+}
+
+/*
+ * Extend relation by multiple blocks.
+ *
+ * Tries to extend the relation by extend_by blocks. Depending on the
+ * availability of resources, the relation may end up being extended by a
+ * smaller number of pages (unless an error is thrown, always by at least one
+ * page). *extended_by is updated to the number of pages the relation has been
+ * extended by.
+ *
+ * buffers needs to be an array that is at least extend_by long. Upon
+ * completion, the first extend_by array elements will point to a pinned
+ * buffer.
+ *
+ * If EB_LOCK_FIRST is part of flags, the first returned buffer is
+ * locked. This is useful for callers that want a buffer that is guaranteed to
+ * be empty.
+ */
+BlockNumber
+ExtendBufferedRelBy(ExtendBufferedWhat eb,
+					ForkNumber fork,
+					BufferAccessStrategy strategy,
+					uint32 flags,
+					uint32 extend_by,
+					Buffer *buffers,
+					uint32 *extended_by)
+{
+	Assert((eb.rel != NULL) != (eb.smgr != NULL));
+	Assert(eb.smgr == NULL || eb.relpersistence != 0);
+	Assert(extend_by > 0);
+
+	if (eb.smgr == NULL)
+	{
+		eb.smgr = RelationGetSmgr(eb.rel);
+		eb.relpersistence = eb.rel->rd_rel->relpersistence;
+	}
+
+	return ExtendBufferedRelCommon(eb, fork, strategy, flags,
+								   extend_by, InvalidBlockNumber,
+								   buffers, extended_by);
+}
+
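For callers, the batch API looks roughly like this (a minimal sketch, not part of the diff; rel is a hypothetical heap Relation, and EB_REL() is assumed to be the relation-based counterpart of the EB_SMGR() initializer used later in this patch):

	Buffer		bufs[16];
	uint32		extended_by = 0;

	/* Try to add up to 16 blocks; with EB_LOCK_FIRST the first comes locked. */
	(void) ExtendBufferedRelBy(EB_REL(rel), MAIN_FORKNUM,
							   NULL, EB_LOCK_FIRST,
							   lengthof(bufs), bufs, &extended_by);

	/* Only the first extended_by entries hold valid pins; keep bufs[0]. */
	for (uint32 i = 1; i < extended_by; i++)
		ReleaseBuffer(bufs[i]);

Fewer pages than requested may come back, e.g. when LimitAdditionalPins() (added further below) caps the batch, so callers must consult *extended_by rather than the requested count.
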
+/*
+ * Extend the relation so it is at least extend_to blocks large, return buffer
+ * (extend_to - 1).
+ *
+ * This is useful for callers that want to write a specific page, regardless
+ * of the current size of the relation (e.g. useful for visibilitymap and for
+ * crash recovery).
+ */
+Buffer
+ExtendBufferedRelTo(ExtendBufferedWhat eb,
+					ForkNumber fork,
+					BufferAccessStrategy strategy,
+					uint32 flags,
+					BlockNumber extend_to,
+					ReadBufferMode mode)
+{
+	BlockNumber current_size;
+	uint32		extended_by = 0;
+	Buffer		buffer = InvalidBuffer;
+	Buffer		buffers[64];
+
+	Assert((eb.rel != NULL) != (eb.smgr != NULL));
+	Assert(eb.smgr == NULL || eb.relpersistence != 0);
+	Assert(extend_to != InvalidBlockNumber && extend_to > 0);
+	Assert(mode == RBM_NORMAL || mode == RBM_ZERO_ON_ERROR ||
+		   mode == RBM_ZERO_AND_LOCK);
+
+	if (eb.smgr == NULL)
+	{
+		eb.smgr = RelationGetSmgr(eb.rel);
+		eb.relpersistence = eb.rel->rd_rel->relpersistence;
+	}
+
+	/*
+	 * If desired, create the file if it doesn't exist. If
+	 * smgr_cached_nblocks[fork] is positive then it must exist, no need for
+	 * an smgrexists call.
+	 */
+	if ((flags & EB_CREATE_FORK_IF_NEEDED) &&
+		(eb.smgr->smgr_cached_nblocks[fork] == 0 ||
+		 eb.smgr->smgr_cached_nblocks[fork] == InvalidBlockNumber) &&
+		!smgrexists(eb.smgr, fork))
+	{
+		LockRelationForExtension(eb.rel, ExclusiveLock);
+
+		/* could have been closed while waiting for lock */
+		if (eb.rel)
+			eb.smgr = RelationGetSmgr(eb.rel);
+
+		/* recheck, fork might have been created concurrently */
+		if (!smgrexists(eb.smgr, fork))
+			smgrcreate(eb.smgr, fork, flags & EB_PERFORMING_RECOVERY);
+
+		UnlockRelationForExtension(eb.rel, ExclusiveLock);
+	}
+
+	/*
+	 * If requested, invalidate size cache, so that smgrnblocks asks the
+	 * kernel.
+	 */
+	if (flags & EB_CLEAR_SIZE_CACHE)
+		eb.smgr->smgr_cached_nblocks[fork] = InvalidBlockNumber;
+
+	/*
+	 * Estimate how many pages we'll need to extend by. This avoids acquiring
+	 * unnecessarily many victim buffers.
+	 */
+	current_size = smgrnblocks(eb.smgr, fork);
+
+	if (mode == RBM_ZERO_AND_LOCK)
+		flags |= EB_LOCK_TARGET;
+
+	while (current_size < extend_to)
+	{
+		uint32		num_pages = lengthof(buffers);
+		BlockNumber first_block;
+
+		if ((uint64) current_size + num_pages > extend_to)
+			num_pages = extend_to - current_size;
+
+		first_block = ExtendBufferedRelCommon(eb, fork, strategy, flags,
+											  num_pages, extend_to,
+											  buffers, &extended_by);
+
+		current_size = first_block + extended_by;
+		Assert(current_size <= extend_to);
+		Assert(num_pages != 0 || current_size >= extend_to);
+
+		for (int i = 0; i < extended_by; i++)
+		{
+			if (first_block + i != extend_to - 1)
+				ReleaseBuffer(buffers[i]);
+			else
+				buffer = buffers[i];
+		}
+	}
+
+	/*
+	 * It's possible that another backend concurrently extended the relation.
+	 * In that case read the buffer.
+	 *
+	 * XXX: Should we control this via a flag?
+	 */
+	if (buffer == InvalidBuffer)
+	{
+		bool		hit;
+
+		Assert(extended_by == 0);
+		buffer = ReadBuffer_common(eb.smgr, eb.relpersistence,
+								   fork, extend_to - 1, mode, strategy,
+								   &hit);
+	}
+
+	return buffer;
+}
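A sketch of the extend-to-a-target use (a hypothetical caller; vm_block is an assumed target block in the style of the visibilitymap case the comment above mentions):

	Buffer		buf;

	/* Ensure blocks 0 .. vm_block exist; returns buffer vm_block, locked. */
	buf = ExtendBufferedRelTo(EB_REL(rel), VISIBILITYMAP_FORKNUM,
							  NULL,
							  EB_CREATE_FORK_IF_NEEDED | EB_CLEAR_SIZE_CACHE,
							  vm_block + 1,
							  RBM_ZERO_AND_LOCK);

If another backend concurrently extends the fork past vm_block, the loop exits early and the function falls back to reading block vm_block through ReadBuffer_common().
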
 
 /*
  * ReadBuffer_common -- common logic for all ReadBuffer variants
@@ -801,35 +992,38 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 	bool		found;
 	IOContext	io_context;
 	IOObject	io_object;
-	bool		isExtend;
 	bool		isLocalBuf = SmgrIsTemp(smgr);
 
 	*hit = false;
 
+	/*
+	 * Backward compatibility path; most code should use ExtendBufferedRel()
+	 * instead, as acquiring the extension lock inside ExtendBufferedRel()
+	 * scales a lot better.
+	 */
+	if (unlikely(blockNum == P_NEW))
+	{
+		uint32		flags = EB_SKIP_EXTENSION_LOCK;
+
+		Assert(mode == RBM_NORMAL ||
+			   mode == RBM_ZERO_AND_LOCK ||
+			   mode == RBM_ZERO_ON_ERROR);
+
+		if (mode == RBM_ZERO_AND_LOCK)
+			flags |= EB_LOCK_FIRST;
+
+		return ExtendBufferedRel(EB_SMGR(smgr, relpersistence),
+								 forkNum, strategy, flags);
+	}
+
 	/* Make sure we will have room to remember the buffer pin */
 	ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
 
-	isExtend = (blockNum == P_NEW);
-
 	TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
 									   smgr->smgr_rlocator.locator.spcOid,
 									   smgr->smgr_rlocator.locator.dbOid,
 									   smgr->smgr_rlocator.locator.relNumber,
-									   smgr->smgr_rlocator.backend,
-									   isExtend);
-
-	/* Substitute proper block number if caller asked for P_NEW */
-	if (isExtend)
-	{
-		blockNum = smgrnblocks(smgr, forkNum);
-		/* Fail if relation is already at maximum possible length */
-		if (blockNum == P_NEW)
-			ereport(ERROR,
-					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
-					 errmsg("cannot extend relation %s beyond %u blocks",
-							relpath(smgr->smgr_rlocator, forkNum),
-							P_NEW)));
-	}
+									   smgr->smgr_rlocator.backend);
 
 	if (isLocalBuf)
 	{
@@ -844,8 +1038,6 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 		bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, &found);
 		if (found)
 			pgBufferUsage.local_blks_hit++;
-		else if (isExtend)
-			pgBufferUsage.local_blks_written++;
 		else if (mode == RBM_NORMAL || mode == RBM_NORMAL_NO_LOG ||
 				 mode == RBM_ZERO_ON_ERROR)
 			pgBufferUsage.local_blks_read++;
@@ -862,8 +1054,6 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 							 strategy, &found, io_context);
 		if (found)
 			pgBufferUsage.shared_blks_hit++;
-		else if (isExtend)
-			pgBufferUsage.shared_blks_written++;
 		else if (mode == RBM_NORMAL || mode == RBM_NORMAL_NO_LOG ||
 				 mode == RBM_ZERO_ON_ERROR)
 			pgBufferUsage.shared_blks_read++;
@@ -874,175 +1064,91 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 
 	/* if it was already in the buffer pool, we're done */
 	if (found)
 	{
-		if (!isExtend)
-		{
-			/* Just need to update stats before we exit */
-			*hit = true;
-			VacuumPageHit++;
-			pgstat_count_io_op(io_object, io_context, IOOP_HIT);
-
-			if (VacuumCostActive)
-				VacuumCostBalance += VacuumCostPageHit;
-
-			TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
-											  smgr->smgr_rlocator.locator.spcOid,
-											  smgr->smgr_rlocator.locator.dbOid,
-											  smgr->smgr_rlocator.locator.relNumber,
-											  smgr->smgr_rlocator.backend,
-											  isExtend,
-											  found);
+		/* Just need to update stats before we exit */
+		*hit = true;
+		VacuumPageHit++;
+		pgstat_count_io_op(io_object, io_context, IOOP_HIT);
 
-			/*
-			 * In RBM_ZERO_AND_LOCK mode the caller expects the page to be
-			 * locked on return.
-			 */
-			if (!isLocalBuf)
-			{
-				if (mode == RBM_ZERO_AND_LOCK)
-					LWLockAcquire(BufferDescriptorGetContentLock(bufHdr),
-								  LW_EXCLUSIVE);
-				else if (mode == RBM_ZERO_AND_CLEANUP_LOCK)
-					LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr));
-			}
-
-			return BufferDescriptorGetBuffer(bufHdr);
-		}
+		if (VacuumCostActive)
+			VacuumCostBalance += VacuumCostPageHit;
 
-		/*
-		 * We get here only in the corner case where we are trying to extend
-		 * the relation but we found a pre-existing buffer marked BM_VALID.
-		 * This can happen because mdread doesn't complain about reads beyond
-		 * EOF (when zero_damaged_pages is ON) and so a previous attempt to
-		 * read a block beyond EOF could have left a "valid" zero-filled
-		 * buffer.  Unfortunately, we have also seen this case occurring
-		 * because of buggy Linux kernels that sometimes return an
-		 * lseek(SEEK_END) result that doesn't account for a recent write. In
-		 * that situation, the pre-existing buffer would contain valid data
-		 * that we don't want to overwrite.  Since the legitimate case should
-		 * always have left a zero-filled buffer, complain if not PageIsNew.
-		 */
-		bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
-		if (!PageIsNew((Page) bufBlock))
-			ereport(ERROR,
-					(errmsg("unexpected data beyond EOF in block %u of relation %s",
-							blockNum, relpath(smgr->smgr_rlocator, forkNum)),
-					 errhint("This has been seen to occur with buggy kernels; consider updating your system.")));
+		TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
+										  smgr->smgr_rlocator.locator.spcOid,
+										  smgr->smgr_rlocator.locator.dbOid,
+										  smgr->smgr_rlocator.locator.relNumber,
+										  smgr->smgr_rlocator.backend,
+										  found);
 
 		/*
-		 * We *must* do smgrextend before succeeding, else the page will not
-		 * be reserved by the kernel, and the next P_NEW call will decide to
-		 * return the same page.  Clear the BM_VALID bit, do the StartBufferIO
-		 * call that BufferAlloc didn't, and proceed.
+		 * In RBM_ZERO_AND_LOCK mode the caller expects the page to be locked
+		 * on return.
 		 */
-		if (isLocalBuf)
+		if (!isLocalBuf)
 		{
-			/* Only need to adjust flags */
-			uint32		buf_state = pg_atomic_read_u32(&bufHdr->state);
-
-			Assert(buf_state & BM_VALID);
-			buf_state &= ~BM_VALID;
-			pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
+			if (mode == RBM_ZERO_AND_LOCK)
+				LWLockAcquire(BufferDescriptorGetContentLock(bufHdr),
+							  LW_EXCLUSIVE);
+			else if (mode == RBM_ZERO_AND_CLEANUP_LOCK)
+				LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr));
 		}
-		else
-		{
-			/*
-			 * Loop to handle the very small possibility that someone re-sets
-			 * BM_VALID between our clearing it and StartBufferIO inspecting
-			 * it.
-			 */
-			do
-			{
-				uint32		buf_state = LockBufHdr(bufHdr);
-
-				Assert(buf_state & BM_VALID);
-				buf_state &= ~BM_VALID;
-				UnlockBufHdr(bufHdr, buf_state);
-			} while (!StartBufferIO(bufHdr, true));
-		}
+
+		return BufferDescriptorGetBuffer(bufHdr);
 	}
 
 	/*
 	 * if we have gotten to this point, we have allocated a buffer for the
 	 * page but its contents are not yet valid.  IO_IN_PROGRESS is set for it,
 	 * if it's a shared buffer.
-	 *
-	 * Note: if smgrextend fails, we will end up with a buffer that is
-	 * allocated but not marked BM_VALID.  P_NEW will still select the same
-	 * block number (because the relation didn't get any longer on disk) and
-	 * so future attempts to extend the relation will find the same buffer (if
-	 * it's not been recycled) but come right back here to try smgrextend
-	 * again.
 	 */
 	Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID));	/* spinlock not needed */
 
 	bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
 
-	if (isExtend)
-	{
-		/* new buffers are zero-filled */
+	/*
+	 * Read in the page, unless the caller intends to overwrite it and just
+	 * wants us to allocate a buffer.
+	 */
+	if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
 		MemSet((char *) bufBlock, 0, BLCKSZ);
-		/* don't set checksum for all-zero page */
-		smgrextend(smgr, forkNum, blockNum, bufBlock, false);
-
-		pgstat_count_io_op(io_object, io_context, IOOP_EXTEND);
-
-		/*
-		 * NB: we're *not* doing a ScheduleBufferTagForWriteback here;
-		 * although we're essentially performing a write.
At least on linux - * doing so defeats the 'delayed allocation' mechanism, leading to - * increased file fragmentation. - */ - } else { - /* - * Read in the page, unless the caller intends to overwrite it and - * just wants us to allocate a buffer. - */ - if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) - MemSet((char *) bufBlock, 0, BLCKSZ); - else - { - instr_time io_start, - io_time; + instr_time io_start, + io_time; - if (track_io_timing) - INSTR_TIME_SET_CURRENT(io_start); - else - INSTR_TIME_SET_ZERO(io_start); + if (track_io_timing) + INSTR_TIME_SET_CURRENT(io_start); - smgrread(smgr, forkNum, blockNum, bufBlock); + smgrread(smgr, forkNum, blockNum, bufBlock); - pgstat_count_io_op(io_object, io_context, IOOP_READ); + if (track_io_timing) + { + INSTR_TIME_SET_CURRENT(io_time); + INSTR_TIME_SUBTRACT(io_time, io_start); + pgstat_count_buffer_read_time(INSTR_TIME_GET_MICROSEC(io_time)); + INSTR_TIME_ADD(pgBufferUsage.blk_read_time, io_time); + } - if (track_io_timing) - { - INSTR_TIME_SET_CURRENT(io_time); - INSTR_TIME_SUBTRACT(io_time, io_start); - pgstat_count_buffer_read_time(INSTR_TIME_GET_MICROSEC(io_time)); - INSTR_TIME_ADD(pgBufferUsage.blk_read_time, io_time); - } + pgstat_count_io_op(io_object, io_context, IOOP_READ); - /* check for garbage data */ - if (!PageIsVerifiedExtended((Page) bufBlock, blockNum, - PIV_LOG_WARNING | PIV_REPORT_STAT)) + /* check for garbage data */ + if (!PageIsVerifiedExtended((Page) bufBlock, blockNum, + PIV_LOG_WARNING | PIV_REPORT_STAT)) + { + if (mode == RBM_ZERO_ON_ERROR || zero_damaged_pages) { - if (mode == RBM_ZERO_ON_ERROR || zero_damaged_pages) - { - ereport(WARNING, - (errcode(ERRCODE_DATA_CORRUPTED), - errmsg("invalid page in block %u of relation %s; zeroing out page", - blockNum, - relpath(smgr->smgr_rlocator, forkNum)))); - MemSet((char *) bufBlock, 0, BLCKSZ); - } - else - ereport(ERROR, - (errcode(ERRCODE_DATA_CORRUPTED), - errmsg("invalid page in block %u of relation %s", - blockNum, - relpath(smgr->smgr_rlocator, forkNum)))); + ereport(WARNING, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("invalid page in block %u of relation %s; zeroing out page", + blockNum, + relpath(smgr->smgr_rlocator, forkNum)))); + MemSet((char *) bufBlock, 0, BLCKSZ); } + else + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("invalid page in block %u of relation %s", + blockNum, + relpath(smgr->smgr_rlocator, forkNum)))); } } @@ -1085,7 +1191,6 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, smgr->smgr_rlocator.locator.dbOid, smgr->smgr_rlocator.locator.relNumber, smgr->smgr_rlocator.backend, - isExtend, found); return BufferDescriptorGetBuffer(bufHdr); @@ -1219,8 +1324,8 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, UnpinBuffer(victim_buf_hdr); /* - * The victim buffer we acquired peviously is clean and unused, - * let it be found again quickly + * The victim buffer we acquired peviously is clean and unused, let it + * be found again quickly */ StrategyFreeBuffer(victim_buf_hdr); @@ -1634,6 +1739,365 @@ again: } /* + * Limit the number of pins a batch operation may additionally acquire, to + * avoid running out of pinnable buffers. + * + * One additional pin is always allowed, as otherwise the operation likely + * cannot be performed at all. + * + * The number of allowed pins for a backend is computed based on + * shared_buffers and the maximum number of connections possible. 
That's very + * pessimistic, but outside of toy-sized shared_buffers it should allow + * sufficient pins. + */ +static void +LimitAdditionalPins(uint32 *additional_pins) +{ + uint32 max_backends; + int max_proportional_pins; + + if (*additional_pins <= 1) + return; + + max_backends = MaxBackends + NUM_AUXILIARY_PROCS; + max_proportional_pins = NBuffers / max_backends; + + /* + * Subtract the approximate number of buffers already pinned by this + * backend. We get the number of "overflowed" pins for free, but don't + * know the number of pins in PrivateRefCountArray. The cost of + * calculating that exactly doesn't seem worth it, so just assume the max. + */ + max_proportional_pins -= PrivateRefCountOverflowed + REFCOUNT_ARRAY_ENTRIES; + + if (max_proportional_pins < 0) + max_proportional_pins = 1; + + if (*additional_pins > max_proportional_pins) + *additional_pins = max_proportional_pins; +} + +/* + * Logic shared between ExtendBufferedRelBy(), ExtendBufferedRelTo(). Just to + * avoid duplicating the tracing and relpersistence related logic. + */ +static BlockNumber +ExtendBufferedRelCommon(ExtendBufferedWhat eb, + ForkNumber fork, + BufferAccessStrategy strategy, + uint32 flags, + uint32 extend_by, + BlockNumber extend_upto, + Buffer *buffers, + uint32 *extended_by) +{ + BlockNumber first_block; + + TRACE_POSTGRESQL_BUFFER_EXTEND_START(fork, + eb.smgr->smgr_rlocator.locator.spcOid, + eb.smgr->smgr_rlocator.locator.dbOid, + eb.smgr->smgr_rlocator.locator.relNumber, + eb.smgr->smgr_rlocator.backend, + extend_by); + + if (eb.relpersistence == RELPERSISTENCE_TEMP) + first_block = ExtendBufferedRelLocal(eb, fork, flags, + extend_by, extend_upto, + buffers, &extend_by); + else + first_block = ExtendBufferedRelShared(eb, fork, strategy, flags, + extend_by, extend_upto, + buffers, &extend_by); + *extended_by = extend_by; + + TRACE_POSTGRESQL_BUFFER_EXTEND_DONE(fork, + eb.smgr->smgr_rlocator.locator.spcOid, + eb.smgr->smgr_rlocator.locator.dbOid, + eb.smgr->smgr_rlocator.locator.relNumber, + eb.smgr->smgr_rlocator.backend, + *extended_by, + first_block); + + return first_block; +} + +/* + * Implementation of ExtendBufferedRelBy() and ExtendBufferedRelTo() for + * shared buffers. + */ +static BlockNumber +ExtendBufferedRelShared(ExtendBufferedWhat eb, + ForkNumber fork, + BufferAccessStrategy strategy, + uint32 flags, + uint32 extend_by, + BlockNumber extend_upto, + Buffer *buffers, + uint32 *extended_by) +{ + BlockNumber first_block; + IOContext io_context = IOContextForStrategy(strategy); + + LimitAdditionalPins(&extend_by); + + /* + * Acquire victim buffers for extension without holding extension lock. + * Writing out victim buffers is the most expensive part of extending the + * relation, particularly when doing so requires WAL flushes. Zeroing out + * the buffers is also quite expensive, so do that before holding the + * extension lock as well. + * + * These pages are pinned by us and not valid. While we hold the pin they + * can't be acquired as victim buffers by another backend. + */ + for (uint32 i = 0; i < extend_by; i++) + { + Block buf_block; + + buffers[i] = GetVictimBuffer(strategy, io_context); + buf_block = BufHdrGetBlock(GetBufferDescriptor(buffers[i] - 1)); + + /* new buffers are zero-filled */ + MemSet((char *) buf_block, 0, BLCKSZ); + } + + /* in case we need to pin an existing buffer below */ + ResourceOwnerEnlargeBuffers(CurrentResourceOwner); + + /* + * Lock relation against concurrent extensions, unless requested not to. + * + * We use the same extension lock for all forks. 
That's unnecessarily + * restrictive, but currently extensions for forks don't happen often + * enough to make it worth locking more granularly. + * + * Note that another backend might have extended the relation by the time + * we get the lock. + */ + if (!(flags & EB_SKIP_EXTENSION_LOCK)) + { + LockRelationForExtension(eb.rel, ExclusiveLock); + if (eb.rel) + eb.smgr = RelationGetSmgr(eb.rel); + } + + /* + * If requested, invalidate size cache, so that smgrnblocks asks the + * kernel. + */ + if (flags & EB_CLEAR_SIZE_CACHE) + eb.smgr->smgr_cached_nblocks[fork] = InvalidBlockNumber; + + first_block = smgrnblocks(eb.smgr, fork); + + /* + * Now that we have the accurate relation size, check if the caller wants + * us to extend to only up to a specific size. If there were concurrent + * extensions, we might have acquired too many buffers and need to release + * them. + */ + if (extend_upto != InvalidBlockNumber) + { + uint32 orig_extend_by = extend_by; + + if (first_block > extend_upto) + extend_by = 0; + else if ((uint64) first_block + extend_by > extend_upto) + extend_by = extend_upto - first_block; + + for (uint32 i = extend_by; i < orig_extend_by; i++) + { + BufferDesc *buf_hdr = GetBufferDescriptor(buffers[i] - 1); + + /* + * The victim buffer we acquired peviously is clean and unused, + * let it be found again quickly + */ + StrategyFreeBuffer(buf_hdr); + UnpinBuffer(buf_hdr); + } + + if (extend_by == 0) + { + if (!(flags & EB_SKIP_EXTENSION_LOCK)) + UnlockRelationForExtension(eb.rel, ExclusiveLock); + *extended_by = extend_by; + return first_block; + } + } + + /* Fail if relation is already at maximum possible length */ + if ((uint64) first_block + extend_by >= MaxBlockNumber) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("cannot extend relation %s beyond %u blocks", + relpath(eb.smgr->smgr_rlocator, fork), + MaxBlockNumber))); + + /* + * Insert buffers into buffer table, mark as IO_IN_PROGRESS. + * + * This needs to happen before we extend the relation, because as soon as + * we do, other backends can start to read in those pages. + */ + for (int i = 0; i < extend_by; i++) + { + Buffer victim_buf = buffers[i]; + BufferDesc *victim_buf_hdr = GetBufferDescriptor(victim_buf - 1); + BufferTag tag; + uint32 hash; + LWLock *partition_lock; + int existing_id; + + InitBufferTag(&tag, &eb.smgr->smgr_rlocator.locator, fork, first_block + i); + hash = BufTableHashCode(&tag); + partition_lock = BufMappingPartitionLock(hash); + + LWLockAcquire(partition_lock, LW_EXCLUSIVE); + + existing_id = BufTableInsert(&tag, hash, victim_buf_hdr->buf_id); + + /* + * We get here only in the corner case where we are trying to extend + * the relation but we found a pre-existing buffer. This can happen + * because a prior attempt at extending the relation failed, and + * because mdread doesn't complain about reads beyond EOF (when + * zero_damaged_pages is ON) and so a previous attempt to read a block + * beyond EOF could have left a "valid" zero-filled buffer. + * Unfortunately, we have also seen this case occurring because of + * buggy Linux kernels that sometimes return an lseek(SEEK_END) result + * that doesn't account for a recent write. In that situation, the + * pre-existing buffer would contain valid data that we don't want to + * overwrite. Since the legitimate cases should always have left a + * zero-filled buffer, complain if not PageIsNew. 
+
+	/* Fail if relation is already at maximum possible length */
+	if ((uint64) first_block + extend_by >= MaxBlockNumber)
+		ereport(ERROR,
+				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+				 errmsg("cannot extend relation %s beyond %u blocks",
+						relpath(eb.smgr->smgr_rlocator, fork),
+						MaxBlockNumber)));
+
+	/*
+	 * Insert buffers into buffer table, mark as IO_IN_PROGRESS.
+	 *
+	 * This needs to happen before we extend the relation, because as soon as
+	 * we do, other backends can start to read in those pages.
+	 */
+	for (int i = 0; i < extend_by; i++)
+	{
+		Buffer		victim_buf = buffers[i];
+		BufferDesc *victim_buf_hdr = GetBufferDescriptor(victim_buf - 1);
+		BufferTag	tag;
+		uint32		hash;
+		LWLock	   *partition_lock;
+		int			existing_id;
+
+		InitBufferTag(&tag, &eb.smgr->smgr_rlocator.locator, fork, first_block + i);
+		hash = BufTableHashCode(&tag);
+		partition_lock = BufMappingPartitionLock(hash);
+
+		LWLockAcquire(partition_lock, LW_EXCLUSIVE);
+
+		existing_id = BufTableInsert(&tag, hash, victim_buf_hdr->buf_id);
+
+		/*
+		 * We get here only in the corner case where we are trying to extend
+		 * the relation but we found a pre-existing buffer.  This can happen
+		 * because a prior attempt at extending the relation failed, and
+		 * because mdread doesn't complain about reads beyond EOF (when
+		 * zero_damaged_pages is ON) and so a previous attempt to read a
+		 * block beyond EOF could have left a "valid" zero-filled buffer.
+		 * Unfortunately, we have also seen this case occurring because of
+		 * buggy Linux kernels that sometimes return an lseek(SEEK_END)
+		 * result that doesn't account for a recent write.  In that
+		 * situation, the pre-existing buffer would contain valid data that
+		 * we don't want to overwrite.  Since the legitimate cases should
+		 * always have left a zero-filled buffer, complain if not PageIsNew.
+		 */
+		if (existing_id >= 0)
+		{
+			BufferDesc *existing_hdr = GetBufferDescriptor(existing_id);
+			Block		buf_block;
+			bool		valid;
+
+			/*
+			 * Pin the existing buffer before releasing the partition lock,
+			 * preventing it from being evicted.
+			 */
+			valid = PinBuffer(existing_hdr, strategy);
+
+			LWLockRelease(partition_lock);
+
+			/*
+			 * The victim buffer we acquired previously is clean and unused,
+			 * let it be found again quickly
+			 */
+			StrategyFreeBuffer(victim_buf_hdr);
+			UnpinBuffer(victim_buf_hdr);
+
+			buffers[i] = BufferDescriptorGetBuffer(existing_hdr);
+			buf_block = BufHdrGetBlock(existing_hdr);
+
+			if (valid && !PageIsNew((Page) buf_block))
+				ereport(ERROR,
+						(errmsg("unexpected data beyond EOF in block %u of relation %s",
+								existing_hdr->tag.blockNum, relpath(eb.smgr->smgr_rlocator, fork)),
+						 errhint("This has been seen to occur with buggy kernels; consider updating your system.")));
+
+			/*
+			 * We *must* do smgr[zero]extend before succeeding, else the page
+			 * will not be reserved by the kernel, and the next P_NEW call
+			 * will decide to return the same page.  Clear the BM_VALID bit,
+			 * do StartBufferIO() and proceed.
+			 *
+			 * Loop to handle the very small possibility that someone re-sets
+			 * BM_VALID between our clearing it and StartBufferIO inspecting
+			 * it.
+			 */
+			do
+			{
+				uint32		buf_state = LockBufHdr(existing_hdr);
+
+				buf_state &= ~BM_VALID;
+				UnlockBufHdr(existing_hdr, buf_state);
+			} while (!StartBufferIO(existing_hdr, true));
+		}
+		else
+		{
+			uint32		buf_state;
+
+			buf_state = LockBufHdr(victim_buf_hdr);
+
+			/* some sanity checks while we hold the buffer header lock */
+			Assert(!(buf_state & (BM_VALID | BM_TAG_VALID | BM_DIRTY | BM_JUST_DIRTIED)));
+			Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
+
+			victim_buf_hdr->tag = tag;
+
+			buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
+			if (eb.relpersistence == RELPERSISTENCE_PERMANENT || fork == INIT_FORKNUM)
+				buf_state |= BM_PERMANENT;
+
+			UnlockBufHdr(victim_buf_hdr, buf_state);
+
+			LWLockRelease(partition_lock);
+
+			/* XXX: could combine the locked operations in it with the above */
+			StartBufferIO(victim_buf_hdr, true);
+		}
+	}
+
+	/*
+	 * Note: if smgrzeroextend fails, we will end up with buffers that are
+	 * allocated but not marked BM_VALID.  The next relation extension will
+	 * still select the same block number (because the relation didn't get
+	 * any longer on disk) and so future attempts to extend the relation will
+	 * find the same buffers (if they have not been recycled) but come right
+	 * back here to try smgrzeroextend again.
+	 *
+	 * We don't need to set checksum for all-zero pages.
+	 */
+	smgrzeroextend(eb.smgr, fork, first_block, extend_by, false);
+
+	/*
+	 * Release the file-extension lock; it's now OK for someone else to
+	 * extend the relation some more.
+	 *
+	 * We remove IO_IN_PROGRESS after this, as waking up waiting backends can
+	 * take noticeable time.
+	 */
+	if (!(flags & EB_SKIP_EXTENSION_LOCK))
+		UnlockRelationForExtension(eb.rel, ExclusiveLock);
+
+	/* Set BM_VALID, terminate IO, and wake up any waiters */
+	for (int i = 0; i < extend_by; i++)
+	{
+		Buffer		buf = buffers[i];
+		BufferDesc *buf_hdr = GetBufferDescriptor(buf - 1);
+		bool		lock = false;
+
+		if (flags & EB_LOCK_FIRST && i == 0)
+			lock = true;
+		else if (flags & EB_LOCK_TARGET)
+		{
+			Assert(extend_upto != InvalidBlockNumber);
+			if (first_block + i + 1 == extend_upto)
+				lock = true;
+		}
+
+		if (lock)
+			LWLockAcquire(BufferDescriptorGetContentLock(buf_hdr), LW_EXCLUSIVE);
+
+		TerminateBufferIO(buf_hdr, false, BM_VALID);
+	}
+
+	pgBufferUsage.shared_blks_written += extend_by;
+	pgstat_count_io_op_n(IOOBJECT_RELATION, io_context, IOOP_EXTEND,
+						 extend_by);
+
+	*extended_by = extend_by;
+
+	return first_block;
+}
+
+/*
  * MarkBufferDirty
  *
  * Marks buffer contents as dirty (actual write happens later).
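Taken together with the P_NEW compatibility branch in ReadBuffer_common(), the caller-side change this enables looks roughly like the following (a sketch with a hypothetical rel; the old idiom is the pre-existing pattern, not code from this diff):

	/* Old idiom: serialize the whole extension on the extension lock. */
	LockRelationForExtension(rel, ExclusiveLock);
	buf = ReadBuffer(rel, P_NEW);
	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
	UnlockRelationForExtension(rel, ExclusiveLock);

	/* New idiom: victim-buffer work happens before the lock, inside bufmgr. */
	buf = ExtendBufferedRel(EB_REL(rel), MAIN_FORKNUM, NULL, EB_LOCK_FIRST);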