aboutsummaryrefslogtreecommitdiff
path: root/src/backend/storage/buffer/localbuf.c
diff options
context:
space:
mode:
authorAndres Freund <andres@anarazel.de>2016-04-10 20:12:32 -0700
committerAndres Freund <andres@anarazel.de>2016-04-10 20:12:32 -0700
commit48354581a49c30f5757c203415aa8412d85b0f70 (patch)
treeca509a2c196f179e97993ac89979c361c4b5f431 /src/backend/storage/buffer/localbuf.c
parentcf223c3bf5ba16232147c66b5fef4037aafe747c (diff)
downloadpostgresql-48354581a49c30f5757c203415aa8412d85b0f70.tar.gz
postgresql-48354581a49c30f5757c203415aa8412d85b0f70.zip
Allow Pin/UnpinBuffer to operate in a lockfree manner.
Pinning/Unpinning a buffer is a very frequent operation, especially in read-mostly, cache-resident workloads. Benchmarking shows that in various scenarios the spinlock protecting a buffer header's state becomes a significant bottleneck. The problem can be reproduced with pgbench -S on larger machines, but can be considerably worse for queries which touch the same buffers over and over at a high frequency (e.g. nested loops over a small inner table). To allow atomic operations to be used, cram BufferDesc's flags, usage_count, buf_hdr_lock, and refcount into a single 32-bit atomic variable; that allows them to be manipulated together using 32-bit compare-and-swap operations. This requires reducing MAX_BACKENDS to 2^18-1 (which could be lifted by using a 64-bit field, but it's not a realistic configuration atm). As not all operations can easily be implemented in a lockfree manner, implement the previous buf_hdr_lock via a flag bit in the atomic variable. That way we can continue to lock the header in places where it's needed, but can get away without acquiring it in the more frequent hot paths. There are some additional operations which can be done without the lock but aren't in this patch; the most important places are covered, though. As bufmgr.c now essentially re-implements spinlocks, abstract the delay logic from s_lock.c into something more generic. It already has two users, and more are coming up; there's a follow-up patch for lwlock.c at least. This patch is based on a proof-of-concept written by me, which Alexander Korotkov made into a fully working patch; the committed version is again revised by me. Benchmarking and testing has, amongst others, been provided by Dilip Kumar, Alexander Korotkov, Robert Haas. On a large x86 system, improvements of a factor of 8 have been observed for read-only pgbench with a high client count. Author: Alexander Korotkov and Andres Freund Discussion: 2400449.GjM57CE0Yg@dinodell
Diffstat (limited to 'src/backend/storage/buffer/localbuf.c')
-rw-r--r--src/backend/storage/buffer/localbuf.c64
1 files changed, 43 insertions, 21 deletions
diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c
index 17640cfe2a7..68b402023a1 100644
--- a/src/backend/storage/buffer/localbuf.c
+++ b/src/backend/storage/buffer/localbuf.c
@@ -108,6 +108,7 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
int b;
int trycounter;
bool found;
+ uint32 buf_state;
INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
@@ -128,16 +129,21 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
fprintf(stderr, "LB ALLOC (%u,%d,%d) %d\n",
smgr->smgr_rnode.node.relNode, forkNum, blockNum, -b - 1);
#endif
+ buf_state = pg_atomic_read_u32(&bufHdr->state);
+
/* this part is equivalent to PinBuffer for a shared buffer */
if (LocalRefCount[b] == 0)
{
- if (bufHdr->usage_count < BM_MAX_USAGE_COUNT)
- bufHdr->usage_count++;
+ if (BUF_STATE_GET_USAGECOUNT(buf_state) < BM_MAX_USAGE_COUNT)
+ {
+ buf_state += BUF_USAGECOUNT_ONE;
+ pg_atomic_write_u32(&bufHdr->state, buf_state);
+ }
}
LocalRefCount[b]++;
ResourceOwnerRememberBuffer(CurrentResourceOwner,
BufferDescriptorGetBuffer(bufHdr));
- if (bufHdr->flags & BM_VALID)
+ if (buf_state & BM_VALID)
*foundPtr = TRUE;
else
{
@@ -169,9 +175,12 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
if (LocalRefCount[b] == 0)
{
- if (bufHdr->usage_count > 0)
+ buf_state = pg_atomic_read_u32(&bufHdr->state);
+
+ if (BUF_STATE_GET_USAGECOUNT(buf_state) > 0)
{
- bufHdr->usage_count--;
+ buf_state -= BUF_USAGECOUNT_ONE;
+ pg_atomic_write_u32(&bufHdr->state, buf_state);
trycounter = NLocBuffer;
}
else
@@ -193,7 +202,7 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
* this buffer is not referenced but it might still be dirty. if that's
* the case, write it out before reusing it!
*/
- if (bufHdr->flags & BM_DIRTY)
+ if (buf_state & BM_DIRTY)
{
SMgrRelation oreln;
Page localpage = (char *) LocalBufHdrGetBlock(bufHdr);
@@ -211,7 +220,8 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
false);
/* Mark not-dirty now in case we error out below */
- bufHdr->flags &= ~BM_DIRTY;
+ buf_state &= ~BM_DIRTY;
+ pg_atomic_write_u32(&bufHdr->state, buf_state);
pgBufferUsage.local_blks_written++;
}
@@ -228,7 +238,7 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
/*
* Update the hash table: remove old entry, if any, and make new one.
*/
- if (bufHdr->flags & BM_TAG_VALID)
+ if (buf_state & BM_TAG_VALID)
{
hresult = (LocalBufferLookupEnt *)
hash_search(LocalBufHash, (void *) &bufHdr->tag,
@@ -237,7 +247,8 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
elog(ERROR, "local buffer hash table corrupted");
/* mark buffer invalid just in case hash insert fails */
CLEAR_BUFFERTAG(bufHdr->tag);
- bufHdr->flags &= ~(BM_VALID | BM_TAG_VALID);
+ buf_state &= ~(BM_VALID | BM_TAG_VALID);
+ pg_atomic_write_u32(&bufHdr->state, buf_state);
}
hresult = (LocalBufferLookupEnt *)
@@ -250,9 +261,11 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
* it's all ours now.
*/
bufHdr->tag = newTag;
- bufHdr->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_IO_ERROR);
- bufHdr->flags |= BM_TAG_VALID;
- bufHdr->usage_count = 1;
+ buf_state &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_IO_ERROR);
+ buf_state |= BM_TAG_VALID;
+ buf_state &= ~BUF_USAGECOUNT_MASK;
+ buf_state += BUF_USAGECOUNT_ONE;
+ pg_atomic_write_u32(&bufHdr->state, buf_state);
*foundPtr = FALSE;
return bufHdr;
@@ -267,6 +280,7 @@ MarkLocalBufferDirty(Buffer buffer)
{
int bufid;
BufferDesc *bufHdr;
+ uint32 buf_state;
Assert(BufferIsLocal(buffer));
@@ -280,10 +294,10 @@ MarkLocalBufferDirty(Buffer buffer)
bufHdr = GetLocalBufferDescriptor(bufid);
- if (!(bufHdr->flags & BM_DIRTY))
- pgBufferUsage.local_blks_dirtied++;
+ buf_state = pg_atomic_fetch_or_u32(&bufHdr->state, BM_DIRTY);
- bufHdr->flags |= BM_DIRTY;
+ if (!(buf_state & BM_DIRTY))
+ pgBufferUsage.local_blks_dirtied++;
}
/*
@@ -307,8 +321,11 @@ DropRelFileNodeLocalBuffers(RelFileNode rnode, ForkNumber forkNum,
{
BufferDesc *bufHdr = GetLocalBufferDescriptor(i);
LocalBufferLookupEnt *hresult;
+ uint32 buf_state;
- if ((bufHdr->flags & BM_TAG_VALID) &&
+ buf_state = pg_atomic_read_u32(&bufHdr->state);
+
+ if ((buf_state & BM_TAG_VALID) &&
RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
bufHdr->tag.forkNum == forkNum &&
bufHdr->tag.blockNum >= firstDelBlock)
@@ -327,8 +344,9 @@ DropRelFileNodeLocalBuffers(RelFileNode rnode, ForkNumber forkNum,
elog(ERROR, "local buffer hash table corrupted");
/* Mark buffer invalid */
CLEAR_BUFFERTAG(bufHdr->tag);
- bufHdr->flags = 0;
- bufHdr->usage_count = 0;
+ buf_state &= ~BUF_FLAG_MASK;
+ buf_state &= ~BUF_USAGECOUNT_MASK;
+ pg_atomic_write_u32(&bufHdr->state, buf_state);
}
}
}
@@ -349,8 +367,11 @@ DropRelFileNodeAllLocalBuffers(RelFileNode rnode)
{
BufferDesc *bufHdr = GetLocalBufferDescriptor(i);
LocalBufferLookupEnt *hresult;
+ uint32 buf_state;
+
+ buf_state = pg_atomic_read_u32(&bufHdr->state);
- if ((bufHdr->flags & BM_TAG_VALID) &&
+ if ((buf_state & BM_TAG_VALID) &&
RelFileNodeEquals(bufHdr->tag.rnode, rnode))
{
if (LocalRefCount[i] != 0)
@@ -367,8 +388,9 @@ DropRelFileNodeAllLocalBuffers(RelFileNode rnode)
elog(ERROR, "local buffer hash table corrupted");
/* Mark buffer invalid */
CLEAR_BUFFERTAG(bufHdr->tag);
- bufHdr->flags = 0;
- bufHdr->usage_count = 0;
+ buf_state &= ~BUF_FLAG_MASK;
+ buf_state &= ~BUF_USAGECOUNT_MASK;
+ pg_atomic_write_u32(&bufHdr->state, buf_state);
}
}
}