diff options
Diffstat (limited to 'src/include/storage/buf_internals.h')
-rw-r--r-- | src/include/storage/buf_internals.h | 101 |
1 files changed, 69 insertions, 32 deletions
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h index d04363b1665..f8f71255fdd 100644 --- a/src/include/storage/buf_internals.h +++ b/src/include/storage/buf_internals.h @@ -21,29 +21,51 @@ #include "storage/lwlock.h" #include "storage/shmem.h" #include "storage/smgr.h" +#include "port/atomics.h" #include "storage/spin.h" #include "utils/relcache.h" /* + * Buffer state is a single 32-bit variable where following data is combined. + * + * - 18 bits refcount + * - 4 bits usage count + * - 10 bits of flags + * + * Combining these values allows to perform some operations without locking + * the buffer header, by modifying them together with a CAS loop. + * + * The definition of buffer state components is below. + */ +#define BUF_REFCOUNT_ONE 1 +#define BUF_REFCOUNT_MASK ((1U << 18) - 1) +#define BUF_USAGECOUNT_MASK 0x003C0000U +#define BUF_USAGECOUNT_ONE (1U << 18) +#define BUF_USAGECOUNT_SHIFT 18 +#define BUF_FLAG_MASK 0xFFC00000U + +/* Get refcount and usagecount from buffer state */ +#define BUF_STATE_GET_REFCOUNT(state) ((state) & BUF_REFCOUNT_MASK) +#define BUF_STATE_GET_USAGECOUNT(state) (((state) & BUF_USAGECOUNT_MASK) >> BUF_USAGECOUNT_SHIFT) + +/* * Flags for buffer descriptors * * Note: TAG_VALID essentially means that there is a buffer hashtable * entry associated with the buffer's tag. */ -#define BM_DIRTY (1 << 0) /* data needs writing */ -#define BM_VALID (1 << 1) /* data is valid */ -#define BM_TAG_VALID (1 << 2) /* tag is assigned */ -#define BM_IO_IN_PROGRESS (1 << 3) /* read or write in progress */ -#define BM_IO_ERROR (1 << 4) /* previous I/O failed */ -#define BM_JUST_DIRTIED (1 << 5) /* dirtied since write started */ -#define BM_PIN_COUNT_WAITER (1 << 6) /* have waiter for sole pin */ -#define BM_CHECKPOINT_NEEDED (1 << 7) /* must write for checkpoint */ -#define BM_PERMANENT (1 << 8) /* permanent relation (not +#define BM_LOCKED (1U << 22) /* buffer header is locked */ +#define BM_DIRTY (1U << 23) /* data needs writing */ +#define BM_VALID (1U << 24) /* data is valid */ +#define BM_TAG_VALID (1U << 25) /* tag is assigned */ +#define BM_IO_IN_PROGRESS (1U << 26) /* read or write in progress */ +#define BM_IO_ERROR (1U << 27) /* previous I/O failed */ +#define BM_JUST_DIRTIED (1U << 28) /* dirtied since write started */ +#define BM_PIN_COUNT_WAITER (1U << 29) /* have waiter for sole pin */ +#define BM_CHECKPOINT_NEEDED (1U << 30) /* must write for checkpoint */ +#define BM_PERMANENT (1U << 31) /* permanent relation (not * unlogged) */ - -typedef bits16 BufFlags; - /* * The maximum allowed value of usage_count represents a tradeoff between * accuracy and speed of the clock-sweep buffer management algorithm. A @@ -113,18 +135,29 @@ typedef struct buftag /* * BufferDesc -- shared descriptor/state data for a single shared buffer. * - * Note: buf_hdr_lock must be held to examine or change the tag, flags, - * usage_count, refcount, or wait_backend_pid fields. buf_id field never - * changes after initialization, so does not need locking. freeNext is - * protected by the buffer_strategy_lock not buf_hdr_lock. The LWLock can - * take care of itself. The buf_hdr_lock is *not* used to control access to - * the data in the buffer! + * Note: Buffer header lock (BM_LOCKED flag) must be held to examine or change + * the tag, state or wait_backend_pid fields. In general, buffer header lock + * is a spinlock which is combined with flags, refcount and usagecount into + * single atomic variable. This layout allow us to do some operations in a + * single atomic operation, without actually acquiring and releasing spinlock; + * for instance, increase or decrease refcount. buf_id field never changes + * after initialization, so does not need locking. freeNext is protected by + * the buffer_strategy_lock not buffer header lock. The LWLock can take care + * of itself. The buffer header lock is *not* used to control access to the + * data in the buffer! + * + * It's assumed that nobody changes the state field while buffer header lock + * is held. Thus buffer header lock holder can do complex updates of the + * state variable in single write, simultaneously with lock release (cleaning + * BM_LOCKED flag). On the other hand, updating of state without holding + * buffer header lock is restricted to CAS, which insure that BM_LOCKED flag + * is not set. Atomic increment/decrement, OR/AND etc. are not allowed. * * An exception is that if we have the buffer pinned, its tag can't change - * underneath us, so we can examine the tag without locking the spinlock. + * underneath us, so we can examine the tag without locking the buffer header. * Also, in places we do one-time reads of the flags without bothering to - * lock the spinlock; this is generally for situations where we don't expect - * the flag bit being tested to be changing. + * lock the buffer header; this is generally for situations where we don't + * expect the flag bit being tested to be changing. * * We can't physically remove items from a disk page if another backend has * the buffer pinned. Hence, a backend may need to wait for all other pins @@ -142,13 +175,12 @@ typedef struct buftag typedef struct BufferDesc { BufferTag tag; /* ID of page contained in buffer */ - BufFlags flags; /* see bit definitions above */ - uint8 usage_count; /* usage counter for clock sweep code */ - slock_t buf_hdr_lock; /* protects a subset of fields, see above */ - unsigned refcount; /* # of backends holding pins on buffer */ - int wait_backend_pid; /* backend PID of pin-count waiter */ - int buf_id; /* buffer's index number (from 0) */ + + /* state of the tag, containing flags, refcount and usagecount */ + pg_atomic_uint32 state; + + int wait_backend_pid; /* backend PID of pin-count waiter */ int freeNext; /* link in freelist chain */ LWLock content_lock; /* to lock access to buffer contents */ @@ -202,11 +234,15 @@ extern PGDLLIMPORT LWLockMinimallyPadded *BufferIOLWLockArray; #define FREENEXT_NOT_IN_LIST (-2) /* - * Macros for acquiring/releasing a shared buffer header's spinlock. - * Do not apply these to local buffers! + * Functions for acquiring/releasing a shared buffer header's spinlock. Do + * not apply these to local buffers! */ -#define LockBufHdr(bufHdr) SpinLockAcquire(&(bufHdr)->buf_hdr_lock) -#define UnlockBufHdr(bufHdr) SpinLockRelease(&(bufHdr)->buf_hdr_lock) +extern uint32 LockBufHdr(BufferDesc *desc); +#define UnlockBufHdr(desc, s) \ + do { \ + pg_atomic_write_u32(&(desc)->state, (s) & (~BM_LOCKED)); \ + pg_write_barrier(); \ + } while (0) /* @@ -267,7 +303,8 @@ extern void IssuePendingWritebacks(WritebackContext *context); extern void ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag); /* freelist.c */ -extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy); +extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy, + uint32 *buf_state); extern void StrategyFreeBuffer(BufferDesc *buf); extern bool StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf); |