Diffstat (limited to 'src')
-rw-r--r-- | src/backend/storage/buffer/buf_init.c | 61
-rw-r--r-- | src/backend/storage/buffer/bufmgr.c   | 57
-rw-r--r-- | src/backend/storage/lmgr/lwlock.c     | 15
-rw-r--r-- | src/include/storage/buf_internals.h   | 23
-rw-r--r-- | src/include/storage/lwlock.h          | 61
-rw-r--r-- | src/tools/pgindent/typedefs.list      |  1
6 files changed, 151 insertions, 67 deletions
diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
index 3ae2848da05..2a84a1ebade 100644
--- a/src/backend/storage/buffer/buf_init.c
+++ b/src/backend/storage/buffer/buf_init.c
@@ -20,6 +20,9 @@
 
 BufferDescPadded *BufferDescriptors;
 char       *BufferBlocks;
+LWLockMinimallyPadded *BufferIOLWLockArray = NULL;
+LWLockTranche BufferIOLWLockTranche;
+LWLockTranche BufferContentLWLockTranche;
 
 
 /*
@@ -65,22 +68,45 @@ void
 InitBufferPool(void)
 {
     bool        foundBufs,
-                foundDescs;
+                foundDescs,
+                foundIOLocks;
 
     /* Align descriptors to a cacheline boundary. */
-    BufferDescriptors = (BufferDescPadded *) CACHELINEALIGN(
-                        ShmemInitStruct("Buffer Descriptors",
-                        NBuffers * sizeof(BufferDescPadded) + PG_CACHE_LINE_SIZE,
-                        &foundDescs));
+    BufferDescriptors = (BufferDescPadded *)
+        CACHELINEALIGN(
+                       ShmemInitStruct("Buffer Descriptors",
+                                       NBuffers * sizeof(BufferDescPadded) +
+                                       PG_CACHE_LINE_SIZE,
+                                       &foundDescs));
 
     BufferBlocks = (char *)
         ShmemInitStruct("Buffer Blocks",
                         NBuffers * (Size) BLCKSZ, &foundBufs);
 
-    if (foundDescs || foundBufs)
+    /* Align lwlocks to cacheline boundary */
+    BufferIOLWLockArray = (LWLockMinimallyPadded *)
+        CACHELINEALIGN(ShmemInitStruct("Buffer IO Locks",
+                            NBuffers * (Size) sizeof(LWLockMinimallyPadded) +
+                                       PG_CACHE_LINE_SIZE,
+                                       &foundIOLocks));
+
+    BufferIOLWLockTranche.name = "Buffer IO Locks";
+    BufferIOLWLockTranche.array_base = BufferIOLWLockArray;
+    BufferIOLWLockTranche.array_stride = sizeof(LWLockMinimallyPadded);
+    LWLockRegisterTranche(LWTRANCHE_BUFFER_IO_IN_PROGRESS,
+                          &BufferIOLWLockTranche);
+
+    BufferContentLWLockTranche.name = "Buffer Content Locks";
+    BufferContentLWLockTranche.array_base =
+        ((char *) BufferDescriptors) + offsetof(BufferDesc, content_lock);
+    BufferContentLWLockTranche.array_stride = sizeof(BufferDescPadded);
+    LWLockRegisterTranche(LWTRANCHE_BUFFER_CONTENT,
+                          &BufferContentLWLockTranche);
+
+    if (foundDescs || foundBufs || foundIOLocks)
     {
-        /* both should be present or neither */
-        Assert(foundDescs && foundBufs);
+        /* should find all of these, or none of them */
+        Assert(foundDescs && foundBufs && foundIOLocks);
         /* note: this path is only taken in EXEC_BACKEND case */
     }
     else
@@ -110,8 +136,11 @@ InitBufferPool(void)
          */
        buf->freeNext = i + 1;
 
-       buf->io_in_progress_lock = LWLockAssign();
-       buf->content_lock = LWLockAssign();
+       LWLockInitialize(BufferDescriptorGetContentLock(buf),
+                        LWTRANCHE_BUFFER_CONTENT);
+
+       LWLockInitialize(BufferDescriptorGetIOLock(buf),
+                        LWTRANCHE_BUFFER_IO_IN_PROGRESS);
    }
 
    /* Correct last entry of linked list */
@@ -144,5 +173,17 @@ BufferShmemSize(void)
    /* size of stuff controlled by freelist.c */
    size = add_size(size, StrategyShmemSize());
 
+   /*
+    * It would be nice to include the I/O locks in the BufferDesc, but that
+    * would increase the size of a BufferDesc to more than one cache line, and
+    * benchmarking has shown that keeping every BufferDesc aligned on a cache
+    * line boundary is important for performance.  So, instead, the array of
+    * I/O locks is allocated in a separate tranche.  Because those locks are
+    * not highly contentended, we lay out the array with minimal padding.
+    */
+   size = add_size(size, mul_size(NBuffers, sizeof(LWLockMinimallyPadded)));
+   /* to allow aligning the above */
+   size = add_size(size, PG_CACHE_LINE_SIZE);
+
    return size;
 }
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 0d5fb0db88f..a1ad23ccf52 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -738,7 +738,8 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
        if (!isLocalBuf)
        {
            if (mode == RBM_ZERO_AND_LOCK)
-               LWLockAcquire(bufHdr->content_lock, LW_EXCLUSIVE);
+               LWLockAcquire(BufferDescriptorGetContentLock(bufHdr),
+                             LW_EXCLUSIVE);
            else if (mode == RBM_ZERO_AND_CLEANUP_LOCK)
                LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr));
        }
@@ -879,7 +880,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
    if ((mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) &&
        !isLocalBuf)
    {
-       LWLockAcquire(bufHdr->content_lock, LW_EXCLUSIVE);
+       LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_EXCLUSIVE);
    }
 
    if (isLocalBuf)
@@ -1045,7 +1046,8 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
         * happens to be trying to split the page the first one got from
         * StrategyGetBuffer.)
         */
-       if (LWLockConditionalAcquire(buf->content_lock, LW_SHARED))
+       if (LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
+                                    LW_SHARED))
        {
            /*
             * If using a nondefault strategy, and writing the buffer
@@ -1067,7 +1069,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
                StrategyRejectBuffer(strategy, buf))
            {
                /* Drop lock/pin and loop around for another buffer */
-               LWLockRelease(buf->content_lock);
+               LWLockRelease(BufferDescriptorGetContentLock(buf));
                UnpinBuffer(buf, true);
                continue;
            }
@@ -1080,7 +1082,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
                                               smgr->smgr_rnode.node.relNode);
 
            FlushBuffer(buf, NULL);
-           LWLockRelease(buf->content_lock);
+           LWLockRelease(BufferDescriptorGetContentLock(buf));
 
            TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_DONE(forkNum, blockNum,
                                               smgr->smgr_rnode.node.spcNode,
@@ -1395,7 +1397,7 @@ MarkBufferDirty(Buffer buffer)
 
    Assert(BufferIsPinned(buffer));
    /* unfortunately we can't check if the lock is held exclusively */
-   Assert(LWLockHeldByMe(bufHdr->content_lock));
+   Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
 
    LockBufHdr(bufHdr);
 
@@ -1595,8 +1597,8 @@ UnpinBuffer(BufferDesc *buf, bool fixOwner)
    if (ref->refcount == 0)
    {
        /* I'd better not still hold any locks on the buffer */
-       Assert(!LWLockHeldByMe(buf->content_lock));
-       Assert(!LWLockHeldByMe(buf->io_in_progress_lock));
+       Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
+       Assert(!LWLockHeldByMe(BufferDescriptorGetIOLock(buf)));
 
        LockBufHdr(buf);
 
@@ -2116,11 +2118,11 @@ SyncOneBuffer(int buf_id, bool skip_recently_used)
     * buffer is clean by the time we've locked it.)
     */
    PinBuffer_Locked(bufHdr);
-   LWLockAcquire(bufHdr->content_lock, LW_SHARED);
+   LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
 
    FlushBuffer(bufHdr, NULL);
 
-   LWLockRelease(bufHdr->content_lock);
+   LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
    UnpinBuffer(bufHdr, true);
 
    return result | BUF_WRITTEN;
@@ -2926,9 +2928,9 @@ FlushRelationBuffers(Relation rel)
            (bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
        {
            PinBuffer_Locked(bufHdr);
-           LWLockAcquire(bufHdr->content_lock, LW_SHARED);
+           LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
            FlushBuffer(bufHdr, rel->rd_smgr);
-           LWLockRelease(bufHdr->content_lock);
+           LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
            UnpinBuffer(bufHdr, true);
        }
        else
@@ -2978,9 +2980,9 @@ FlushDatabaseBuffers(Oid dbid)
            (bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
        {
            PinBuffer_Locked(bufHdr);
-           LWLockAcquire(bufHdr->content_lock, LW_SHARED);
+           LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
            FlushBuffer(bufHdr, NULL);
-           LWLockRelease(bufHdr->content_lock);
+           LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
            UnpinBuffer(bufHdr, true);
        }
        else
@@ -3004,7 +3006,7 @@ FlushOneBuffer(Buffer buffer)
 
    bufHdr = GetBufferDescriptor(buffer - 1);
 
-   Assert(LWLockHeldByMe(bufHdr->content_lock));
+   Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
 
    FlushBuffer(bufHdr, NULL);
 }
@@ -3101,7 +3103,7 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
 
    Assert(GetPrivateRefCount(buffer) > 0);
    /* here, either share or exclusive lock is OK */
-   Assert(LWLockHeldByMe(bufHdr->content_lock));
+   Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
 
    /*
     * This routine might get called many times on the same page, if we are
@@ -3254,11 +3256,11 @@ LockBuffer(Buffer buffer, int mode)
    buf = GetBufferDescriptor(buffer - 1);
 
    if (mode == BUFFER_LOCK_UNLOCK)
-       LWLockRelease(buf->content_lock);
+       LWLockRelease(BufferDescriptorGetContentLock(buf));
    else if (mode == BUFFER_LOCK_SHARE)
-       LWLockAcquire(buf->content_lock, LW_SHARED);
+       LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_SHARED);
    else if (mode == BUFFER_LOCK_EXCLUSIVE)
-       LWLockAcquire(buf->content_lock, LW_EXCLUSIVE);
+       LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_EXCLUSIVE);
    else
        elog(ERROR, "unrecognized buffer lock mode: %d", mode);
 }
@@ -3279,7 +3281,8 @@ ConditionalLockBuffer(Buffer buffer)
 
    buf = GetBufferDescriptor(buffer - 1);
 
-   return LWLockConditionalAcquire(buf->content_lock, LW_EXCLUSIVE);
+   return LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
+                                   LW_EXCLUSIVE);
 }
 
 /*
@@ -3489,8 +3492,8 @@ WaitIO(BufferDesc *buf)
        UnlockBufHdr(buf);
        if (!(sv_flags & BM_IO_IN_PROGRESS))
            break;
-       LWLockAcquire(buf->io_in_progress_lock, LW_SHARED);
-       LWLockRelease(buf->io_in_progress_lock);
+       LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_SHARED);
+       LWLockRelease(BufferDescriptorGetIOLock(buf));
    }
 }
 
@@ -3523,7 +3526,7 @@ StartBufferIO(BufferDesc *buf, bool forInput)
         * Grab the io_in_progress lock so that other processes can wait for
         * me to finish the I/O.
         */
-       LWLockAcquire(buf->io_in_progress_lock, LW_EXCLUSIVE);
+       LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
 
        LockBufHdr(buf);
 
@@ -3537,7 +3540,7 @@ StartBufferIO(BufferDesc *buf, bool forInput)
         * him to get unwedged.
         */
        UnlockBufHdr(buf);
-       LWLockRelease(buf->io_in_progress_lock);
+       LWLockRelease(BufferDescriptorGetIOLock(buf));
        WaitIO(buf);
    }
 
@@ -3547,7 +3550,7 @@ StartBufferIO(BufferDesc *buf, bool forInput)
    {
        /* someone else already did the I/O */
        UnlockBufHdr(buf);
-       LWLockRelease(buf->io_in_progress_lock);
+       LWLockRelease(BufferDescriptorGetIOLock(buf));
        return false;
    }
 
@@ -3595,7 +3598,7 @@ TerminateBufferIO(BufferDesc *buf, bool clear_dirty, int set_flag_bits)
 
    InProgressBuf = NULL;
 
-   LWLockRelease(buf->io_in_progress_lock);
+   LWLockRelease(BufferDescriptorGetIOLock(buf));
 }
 
 /*
@@ -3620,7 +3623,7 @@ AbortBufferIO(void)
         * we can use TerminateBufferIO.  Anyone who's executing WaitIO on the
         * buffer will be in a busy spin until we succeed in doing this.
         */
-       LWLockAcquire(buf->io_in_progress_lock, LW_EXCLUSIVE);
+       LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
 
        LockBufHdr(buf);
        Assert(buf->flags & BM_IO_IN_PROGRESS);
diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
index 84691df053b..d43fb61edb3 100644
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -344,18 +344,15 @@ NumLWLocks(void)
    int         numLocks;
 
    /*
-    * Possibly this logic should be spread out among the affected modules,
-    * the same way that shmem space estimation is done.  But for now, there
-    * are few enough users of LWLocks that we can get away with just keeping
-    * the knowledge here.
+    * Many users of LWLocks no longer reserve space in the main array here,
+    * but instead allocate separate tranches.  The latter approach has the
+    * advantage of allowing LWLOCK_STATS and LOCK_DEBUG output to produce
+    * more useful output.
     */
 
    /* Predefined LWLocks */
    numLocks = NUM_FIXED_LWLOCKS;
 
-   /* bufmgr.c needs two for each shared buffer */
-   numLocks += 2 * NBuffers;
-
    /* proc.c needs one for each backend or auxiliary process */
    numLocks += MaxBackends + NUM_AUXILIARY_PROCS;
 
@@ -423,6 +420,10 @@ CreateLWLocks(void)
    StaticAssertExpr(LW_VAL_EXCLUSIVE > (uint32) MAX_BACKENDS,
                     "MAX_BACKENDS too big for lwlock.c");
 
+   StaticAssertExpr(sizeof(LWLock) <= LWLOCK_MINIMAL_SIZE &&
+                    sizeof(LWLock) <= LWLOCK_PADDED_SIZE,
+                    "Miscalculated LWLock padding");
+
    if (!IsUnderPostmaster)
    {
        int         numLocks = NumLWLocks();
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index 19247c4b2be..81aaf47da58 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -115,8 +115,8 @@ typedef struct buftag
  * Note: buf_hdr_lock must be held to examine or change the tag, flags,
  * usage_count, refcount, or wait_backend_pid fields.  buf_id field never
  * changes after initialization, so does not need locking.  freeNext is
- * protected by the buffer_strategy_lock not buf_hdr_lock.  The LWLocks can take
- * care of themselves.  The buf_hdr_lock is *not* used to control access to
+ * protected by the buffer_strategy_lock not buf_hdr_lock.  The LWLock can
+ * take care of itself.  The buf_hdr_lock is *not* used to control access to
  * the data in the buffer!
 *
 * An exception is that if we have the buffer pinned, its tag can't change
@@ -133,22 +133,24 @@ typedef struct buftag
 *
 * We use this same struct for local buffer headers, but the lock fields
 * are not used and not all of the flag bits are useful either.
+ *
+ * Be careful to avoid increasing the size of the struct when adding or
+ * reordering members.  Keeping it below 64 bytes (the most common CPU
+ * cache line size) is fairly important for performance.
 */
 typedef struct BufferDesc
 {
    BufferTag   tag;            /* ID of page contained in buffer */
    BufFlags    flags;          /* see bit definitions above */
-   uint16      usage_count;    /* usage counter for clock sweep code */
+   uint8       usage_count;    /* usage counter for clock sweep code */
+   slock_t     buf_hdr_lock;   /* protects a subset of fields, see above */
    unsigned    refcount;       /* # of backends holding pins on buffer */
    int         wait_backend_pid;   /* backend PID of pin-count waiter */
 
-   slock_t     buf_hdr_lock;   /* protects the above fields */
-
    int         buf_id;         /* buffer's index number (from 0) */
    int         freeNext;       /* link in freelist chain */
 
-   LWLock     *io_in_progress_lock;    /* to wait for I/O to complete */
-   LWLock     *content_lock;   /* to lock access to buffer contents */
+   LWLock      content_lock;   /* to lock access to buffer contents */
 } BufferDesc;
 
 /*
@@ -184,6 +186,13 @@ typedef union BufferDescPadded
 
 #define BufferDescriptorGetBuffer(bdesc) ((bdesc)->buf_id + 1)
 
+#define BufferDescriptorGetIOLock(bdesc) \
+   (&(BufferIOLWLockArray[(bdesc)->buf_id]).lock)
+#define BufferDescriptorGetContentLock(bdesc) \
+   ((LWLock*) (&(bdesc)->content_lock))
+
+extern PGDLLIMPORT LWLockMinimallyPadded *BufferIOLWLockArray;
+
 /*
  * The freeNext field is either the index of the next freelist entry,
  * or one of these special values:
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index ed9025babd1..ff34529ff02 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -25,6 +25,12 @@ struct PGPROC;
 
 /*
+ * Prior to PostgreSQL 9.4, every lightweight lock in the system was stored
+ * in a single array.  For convenience and for compatibility with past
+ * releases, we still have a main array, but it's now also permissible to
+ * store LWLocks elsewhere in the main shared memory segment or in a dynamic
+ * shared memory segment.  Each array of lwlocks forms a separate "tranche".
+ *
  * It's occasionally necessary to identify a particular LWLock "by name"; e.g.
  * because we wish to report the lock to dtrace.  We could store a name or
  * other identifying information in the lock itself, but since it's common
@@ -65,30 +71,51 @@ typedef struct LWLock
 } LWLock;
 
 /*
- * Prior to PostgreSQL 9.4, every lightweight lock in the system was stored
- * in a single array.  For convenience and for compatibility with past
- * releases, we still have a main array, but it's now also permissible to
- * store LWLocks elsewhere in the main shared memory segment or in a dynamic
- * shared memory segment.  In the main array, we force the array stride to
- * be a power of 2, which saves a few cycles in indexing, but more importantly
- * also ensures that individual LWLocks don't cross cache line boundaries.
- * This reduces cache contention problems, especially on AMD Opterons.
- * (Of course, we have to also ensure that the array start address is suitably
- * aligned.)
+ * In most cases, it's desirable to force each tranche of LWLocks to be aligned
+ * on a cache line boundary and make the array stride a power of 2.  This saves
+ * a few cycles in indexing, but more importantly ensures that individual
+ * LWLocks don't cross cache line boundaries.  This reduces cache contention
+ * problems, especially on AMD Opterons.  In some cases, it's useful to add
+ * even more padding so that each LWLock takes up an entire cache line; this is
+ * useful, for example, in the main LWLock array, where the overall number of
+ * locks is small but some are heavily contended.
+ *
+ * When allocating a tranche that contains data other than LWLocks, it is
+ * probably best to include a bare LWLock and then pad the resulting structure
+ * as necessary for performance.  For an array that contains only LWLocks,
+ * LWLockMinimallyPadded can be used for cases where we just want to ensure
+ * that we don't cross cache line boundaries within a single lock, while
+ * LWLockPadded can be used for cases where we want each lock to be an entire
+ * cache line.
 *
- * On a 32-bit platforms a LWLock will these days fit into 16 bytes, but since
- * that didn't use to be the case and cramming more lwlocks into a cacheline
- * might be detrimental performancewise we still use 32 byte alignment
- * there.  So, both on 32 and 64 bit platforms, it should fit into 32 bytes
- * unless slock_t is really big.  We allow for that just in case.
+ * On 32-bit platforms, an LWLockMinimallyPadded might actually contain more
+ * than the absolute minimum amount of padding required to keep a lock from
+ * crossing a cache line boundary, because an unpadded LWLock might fit into
+ * 16 bytes.  We ignore that possibility when determining the minimal amount
+ * of padding.  Older releases had larger LWLocks, so 32 really was the
+ * minimum, and packing them in tighter might hurt performance.
+ *
+ * LWLOCK_MINIMAL_SIZE should be 32 on basically all common platforms, but
+ * because slock_t is more than 2 bytes on some obscure platforms, we allow
+ * for the possibility that it might be 64.
 */
-#define LWLOCK_PADDED_SIZE (sizeof(LWLock) <= 32 ? 32 : 64)
+#define LWLOCK_PADDED_SIZE PG_CACHE_LINE_SIZE
+#define LWLOCK_MINIMAL_SIZE (sizeof(LWLock) <= 32 ? 32 : 64)
 
+/* LWLock, padded to a full cache line size */
 typedef union LWLockPadded
 {
    LWLock      lock;
    char        pad[LWLOCK_PADDED_SIZE];
 } LWLockPadded;
+
+/* LWLock, minimally padded */
+typedef union LWLockMinimallyPadded
+{
+   LWLock      lock;
+   char        pad[LWLOCK_MINIMAL_SIZE];
+} LWLockMinimallyPadded;
+
 extern PGDLLIMPORT LWLockPadded *MainLWLockArray;
 extern char *MainLWLockNames[];
 
@@ -184,6 +211,8 @@
 typedef enum BuiltinTrancheIds
 {
    LWTRANCHE_MAIN,
    LWTRANCHE_WAL_INSERT,
+   LWTRANCHE_BUFFER_CONTENT,
+   LWTRANCHE_BUFFER_IO_IN_PROGRESS,
    LWTRANCHE_FIRST_USER_DEFINED
 } BuiltinTrancheIds;
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 03e1d2c1c76..963d8651141 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -1008,6 +1008,7 @@ LSEG
 LVRelStats
 LWLock
 LWLockHandle
+LWLockMinimallyPadded
 LWLockMode
 LWLockPadded
 LWLockTranche
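The pattern the commit uses for the buffer I/O locks -- carve a cache-line-aligned array of LWLockMinimallyPadded out of shared memory, initialize each element with LWLockInitialize(), and describe the array to lwlock.c with LWLockRegisterTranche() -- is the same one any other module can follow once it stops drawing locks from the main array. The sketch below is illustrative only and assumes the 9.5-era tranche API shown in the diff (an LWLockTranche struct with name, array_base, and array_stride fields); the MyModule* names, the lock count, and the sizing/startup functions are placeholders, not part of this commit.

/*
 * Hypothetical module that keeps its own tranche of minimally padded
 * LWLocks, mirroring what buf_init.c does above for the buffer I/O locks.
 */
#include "postgres.h"

#include "storage/lwlock.h"
#include "storage/shmem.h"

#define MY_MODULE_NLOCKS 128

static LWLockMinimallyPadded *MyModuleLWLockArray = NULL;
static LWLockTranche MyModuleLWLockTranche;
static int MyModuleTrancheId;

/* Amount of shared memory to request for the lock array. */
Size
MyModuleShmemSize(void)
{
    Size        size;

    size = mul_size(MY_MODULE_NLOCKS, sizeof(LWLockMinimallyPadded));
    /* extra room so the array can be realigned to a cache line boundary */
    size = add_size(size, PG_CACHE_LINE_SIZE);
    return size;
}

/* Create-or-attach step, run during shared memory initialization. */
void
MyModuleShmemInit(void)
{
    bool        found;
    int         i;

    /* Align the lock array to a cache line boundary, as buf_init.c does. */
    MyModuleLWLockArray = (LWLockMinimallyPadded *)
        CACHELINEALIGN(ShmemInitStruct("MyModule LWLocks",
                                       MyModuleShmemSize(),
                                       &found));

    if (!found)
    {
        /* First backend here: pick a tranche ID and initialize the locks. */
        MyModuleTrancheId = LWLockNewTrancheId();
        for (i = 0; i < MY_MODULE_NLOCKS; i++)
            LWLockInitialize(&MyModuleLWLockArray[i].lock, MyModuleTrancheId);
    }

    /*
     * Every backend registers the tranche so LWLOCK_STATS / LOCK_DEBUG can
     * name these locks.  A real module would also stash the tranche ID in
     * shared memory so backends that merely attach can look it up; that
     * bookkeeping is omitted here for brevity.
     */
    MyModuleLWLockTranche.name = "MyModule LWLocks";
    MyModuleLWLockTranche.array_base = MyModuleLWLockArray;
    MyModuleLWLockTranche.array_stride = sizeof(LWLockMinimallyPadded);
    LWLockRegisterTranche(MyModuleTrancheId, &MyModuleLWLockTranche);
}

As in BufferShmemSize(), the size estimate reserves PG_CACHE_LINE_SIZE extra bytes so that CACHELINEALIGN() can be applied to whatever address ShmemInitStruct() returns.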