Diffstat (limited to 'src')
 src/backend/access/nbtree/nbtinsert.c  |   2
 src/backend/access/nbtree/nbtpage.c    | 145
 src/backend/access/nbtree/nbtree.c     |   5
 src/backend/access/nbtree/nbtsearch.c  |  23
 src/backend/access/nbtree/nbtutils.c   |   4
 src/backend/storage/buffer/bufmgr.c    |  23
 src/include/access/nbtree.h            |   4
 src/include/pg_config_manual.h         |   8
 8 files changed, 170 insertions(+), 44 deletions(-)
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index b86c122763e..e3a44bc09e0 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -303,7 +303,7 @@ _bt_search_insert(Relation rel, BTInsertState insertstate)
{
/* Simulate a _bt_getbuf() call with conditional locking */
insertstate->buf = ReadBuffer(rel, RelationGetTargetBlock(rel));
- if (ConditionalLockBuffer(insertstate->buf))
+ if (_bt_conditionallockbuf(rel, insertstate->buf))
{
Page page;
BTPageOpaque lpageop;
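
For context, the fastpath touched above follows a try-lock-or-fall-back shape. A minimal sketch of that pattern, assuming the wrapper introduced by this patch (simplified; the real _bt_search_insert() also revalidates the cached block and copes with concurrent splits):

    /* Simulate a _bt_getbuf() call with conditional locking (sketch) */
    insertstate->buf = ReadBuffer(rel, RelationGetTargetBlock(rel));
    if (_bt_conditionallockbuf(rel, insertstate->buf))
    {
        /*
         * Got the write lock without blocking.  The pin didn't come from
         * _bt_getbuf(), so the caller is responsible for _bt_checkpage().
         */
        _bt_checkpage(rel, insertstate->buf);
    }
    else
    {
        /* Lock unavailable without waiting: drop the pin, take slow path */
        ReleaseBuffer(insertstate->buf);
        insertstate->buf = InvalidBuffer;
    }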
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index 75628e0eb98..70bac0052fc 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -32,6 +32,7 @@
#include "storage/indexfsm.h"
#include "storage/lmgr.h"
#include "storage/predicate.h"
+#include "utils/memdebug.h"
#include "utils/snapmgr.h"
static BTMetaPageData *_bt_getmeta(Relation rel, Buffer metabuf);
@@ -198,8 +199,8 @@ _bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact,
}
/* trade in our read lock for a write lock */
- LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
- LockBuffer(metabuf, BT_WRITE);
+ _bt_unlockbuf(rel, metabuf);
+ _bt_lockbuf(rel, metabuf, BT_WRITE);
START_CRIT_SECTION();
@@ -340,8 +341,8 @@ _bt_getroot(Relation rel, int access)
}
/* trade in our read lock for a write lock */
- LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
- LockBuffer(metabuf, BT_WRITE);
+ _bt_unlockbuf(rel, metabuf);
+ _bt_lockbuf(rel, metabuf, BT_WRITE);
/*
* Race condition: if someone else initialized the metadata between
@@ -434,8 +435,8 @@ _bt_getroot(Relation rel, int access)
* else accessing the new root page while it's unlocked, since no one
* else knows where it is yet.
*/
- LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK);
- LockBuffer(rootbuf, BT_READ);
+ _bt_unlockbuf(rel, rootbuf);
+ _bt_lockbuf(rel, rootbuf, BT_READ);
/* okay, metadata is correct, release lock on it without caching */
_bt_relbuf(rel, metabuf);
@@ -783,10 +784,20 @@ _bt_log_reuse_page(Relation rel, BlockNumber blkno, TransactionId latestRemovedX
* blkno == P_NEW means to get an unallocated index page. The page
* will be initialized before returning it.
*
+ * The general rule in nbtree is that it's never okay to access a
+ * page without holding both a buffer pin and a buffer lock on
+ * the page's buffer.
+ *
* When this routine returns, the appropriate lock is set on the
* requested buffer and its reference count has been incremented
* (ie, the buffer is "locked and pinned"). Also, we apply
- * _bt_checkpage to sanity-check the page (except in P_NEW case).
+ * _bt_checkpage to sanity-check the page (except in P_NEW case),
+ * and perform Valgrind client requests that help Valgrind detect
+ * unsafe page accesses.
+ *
+ * Note: raw LockBuffer() calls are disallowed in nbtree; all
+ * buffer lock requests need to go through wrapper functions such
+ * as _bt_lockbuf().
*/
Buffer
_bt_getbuf(Relation rel, BlockNumber blkno, int access)
@@ -797,7 +808,7 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access)
{
/* Read an existing block of the relation */
buf = ReadBuffer(rel, blkno);
- LockBuffer(buf, access);
+ _bt_lockbuf(rel, buf, access);
_bt_checkpage(rel, buf);
}
else
@@ -837,7 +848,7 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access)
if (blkno == InvalidBlockNumber)
break;
buf = ReadBuffer(rel, blkno);
- if (ConditionalLockBuffer(buf))
+ if (_bt_conditionallockbuf(rel, buf))
{
page = BufferGetPage(buf);
if (_bt_page_recyclable(page))
@@ -890,7 +901,7 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access)
buf = ReadBuffer(rel, P_NEW);
/* Acquire buffer lock on new page */
- LockBuffer(buf, BT_WRITE);
+ _bt_lockbuf(rel, buf, BT_WRITE);
/*
* Release the file-extension lock; it's now OK for someone else to
@@ -931,9 +942,10 @@ _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
Assert(blkno != P_NEW);
if (BufferIsValid(obuf))
- LockBuffer(obuf, BUFFER_LOCK_UNLOCK);
+ _bt_unlockbuf(rel, obuf);
buf = ReleaseAndReadBuffer(obuf, rel, blkno);
- LockBuffer(buf, access);
+ _bt_lockbuf(rel, buf, access);
+
_bt_checkpage(rel, buf);
return buf;
}
@@ -946,7 +958,102 @@ _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
void
_bt_relbuf(Relation rel, Buffer buf)
{
- UnlockReleaseBuffer(buf);
+ _bt_unlockbuf(rel, buf);
+ ReleaseBuffer(buf);
+}
+
+/*
+ * _bt_lockbuf() -- lock a pinned buffer.
+ *
+ * Lock is acquired without acquiring another pin. This is like a raw
+ * LockBuffer() call, but performs extra steps needed by Valgrind.
+ *
+ * Note: Caller may need to call _bt_checkpage() with buf when pin on buf
+ * wasn't originally acquired in _bt_getbuf() or _bt_relandgetbuf().
+ */
+void
+_bt_lockbuf(Relation rel, Buffer buf, int access)
+{
+ /* LockBuffer() asserts that pin is held by this backend */
+ LockBuffer(buf, access);
+
+ /*
+ * It doesn't matter that _bt_unlockbuf() won't get called in the
+ * event of an nbtree error (e.g. a unique violation error). That
+ * won't cause Valgrind false positives.
+ *
+ * The nbtree client requests are superimposed on top of the
+ * bufmgr.c buffer pin client requests. In the event of an nbtree
+ * error the buffer will certainly get marked as defined when the
+ * backend once again acquires its first pin on the buffer. (Of
+ * course, if the backend never touches the buffer again then it
+ * doesn't matter that it remains non-accessible to Valgrind.)
+ *
+ * Note: When an IndexTuple C pointer gets computed using an
+ * ItemId read from a page while a lock was held, the C pointer
+ * becomes unsafe to dereference forever as soon as the lock is
+ * released. Valgrind can only detect cases where the pointer
+ * gets dereferenced with no _current_ lock/pin held, though.
+ */
+ if (!RelationUsesLocalBuffers(rel))
+ VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
+}
+
+/*
+ * _bt_unlockbuf() -- unlock a pinned buffer.
+ */
+void
+_bt_unlockbuf(Relation rel, Buffer buf)
+{
+ /*
+ * Buffer is pinned and locked, which means that it is expected to be
+ * defined and addressable. Check that proactively.
+ */
+ VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), BLCKSZ);
+
+ /* LockBuffer() asserts that pin is held by this backend */
+ LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+
+ if (!RelationUsesLocalBuffers(rel))
+ VALGRIND_MAKE_MEM_NOACCESS(BufferGetPage(buf), BLCKSZ);
+}
+
+/*
+ * _bt_conditionallockbuf() -- conditionally acquire a BT_WRITE lock on a
+ * pinned buffer.
+ *
+ * Note: Caller may need to call _bt_checkpage() with buf when pin on buf
+ * wasn't originally acquired in _bt_getbuf() or _bt_relandgetbuf().
+ */
+bool
+_bt_conditionallockbuf(Relation rel, Buffer buf)
+{
+ /* ConditionalLockBuffer() asserts that pin is held by this backend */
+ if (!ConditionalLockBuffer(buf))
+ return false;
+
+ if (!RelationUsesLocalBuffers(rel))
+ VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
+
+ return true;
+}
+
+/*
+ * _bt_upgradelockbufcleanup() -- upgrade lock to super-exclusive/cleanup
+ * lock.
+ */
+void
+_bt_upgradelockbufcleanup(Relation rel, Buffer buf)
+{
+ /*
+ * Buffer is pinned and locked, which means that it is expected to be
+ * defined and addressable. Check that proactively.
+ */
+ VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), BLCKSZ);
+
+ /* LockBuffer() asserts that pin is held by this backend */
+ LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+ LockBufferForCleanup(buf);
}
/*
@@ -1580,7 +1687,7 @@ _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact)
* To avoid deadlocks, we'd better drop the leaf page lock
* before going further.
*/
- LockBuffer(leafbuf, BUFFER_LOCK_UNLOCK);
+ _bt_unlockbuf(rel, leafbuf);
/*
* Check that the left sibling of leafbuf (if any) is not
@@ -1616,7 +1723,7 @@ _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact)
* (Page deletion can cope with the stack being to the left of
* leafbuf, but not to the right of leafbuf.)
*/
- LockBuffer(leafbuf, BT_WRITE);
+ _bt_lockbuf(rel, leafbuf, BT_WRITE);
continue;
}
@@ -1970,7 +2077,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
leafleftsib = opaque->btpo_prev;
leafrightsib = opaque->btpo_next;
- LockBuffer(leafbuf, BUFFER_LOCK_UNLOCK);
+ _bt_unlockbuf(rel, leafbuf);
/*
* Check here, as calling loops will have locks held, preventing
@@ -2005,7 +2112,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
* To avoid deadlocks, we'd better drop the target page lock before
* going further.
*/
- LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+ _bt_unlockbuf(rel, buf);
}
/*
@@ -2022,7 +2129,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
* table.)
*/
if (target != leafblkno)
- LockBuffer(leafbuf, BT_WRITE);
+ _bt_lockbuf(rel, leafbuf, BT_WRITE);
if (leftsib != P_NONE)
{
lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
@@ -2072,7 +2179,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
* rather than a superexclusive lock, since no scan will stop on an empty
* page.
*/
- LockBuffer(buf, BT_WRITE);
+ _bt_lockbuf(rel, buf, BT_WRITE);
page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
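
Taken together, the new wrappers give nbtree callers one uniform discipline. A hedged usage sketch (hypothetical caller; assumes blkno is a valid block and ignores the high-key offset for brevity):

    Buffer      buf = _bt_getbuf(rel, blkno, BT_READ);  /* pin + lock */
    Page        page = BufferGetPage(buf);
    IndexTuple  itup = (IndexTuple) PageGetItem(page,
                                PageGetItemId(page, FirstOffsetNumber));

    /* itup points into the page: only safe while the lock is held */
    _bt_relbuf(rel, buf);       /* _bt_unlockbuf() + ReleaseBuffer() */

    /*
     * Under USE_VALGRIND the page is now marked NOACCESS, so any later
     * dereference of itup is reported as an invalid read.
     */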
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index e947addef6b..d65f4357cc8 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -1115,7 +1115,7 @@ backtrack:
*/
buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
info->strategy);
- LockBuffer(buf, BT_READ);
+ _bt_lockbuf(rel, buf, BT_READ);
page = BufferGetPage(buf);
opaque = NULL;
if (!PageIsNew(page))
@@ -1222,8 +1222,7 @@ backtrack:
* course of the vacuum scan, whether or not it actually contains any
* deletable tuples --- see nbtree/README.
*/
- LockBuffer(buf, BUFFER_LOCK_UNLOCK);
- LockBufferForCleanup(buf);
+ _bt_upgradelockbufcleanup(rel, buf);
/*
* Check whether we need to backtrack to earlier pages. What we are
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index f228c87a2b7..28dc196b55e 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -64,7 +64,7 @@ static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir);
static void
_bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
{
- LockBuffer(sp->buf, BUFFER_LOCK_UNLOCK);
+ _bt_unlockbuf(scan->indexRelation, sp->buf);
if (IsMVCCSnapshot(scan->xs_snapshot) &&
RelationNeedsWAL(scan->indexRelation) &&
@@ -187,14 +187,13 @@ _bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access,
if (access == BT_WRITE && page_access == BT_READ)
{
/* trade in our read lock for a write lock */
- LockBuffer(*bufP, BUFFER_LOCK_UNLOCK);
- LockBuffer(*bufP, BT_WRITE);
+ _bt_unlockbuf(rel, *bufP);
+ _bt_lockbuf(rel, *bufP, BT_WRITE);
/*
- * If the page was split between the time that we surrendered our read
- * lock and acquired our write lock, then this page may no longer be
- * the right place for the key we want to insert. In this case, we
- * need to move right in the tree.
+ * Race -- the leaf page may have split after we dropped the read lock
+ * but before we acquired a write lock. If it has, we may need to
+ * move right to its new sibling. Do that.
*/
*bufP = _bt_moveright(rel, key, *bufP, true, stack_in, BT_WRITE,
snapshot);
@@ -289,8 +288,8 @@ _bt_moveright(Relation rel,
/* upgrade our lock if necessary */
if (access == BT_READ)
{
- LockBuffer(buf, BUFFER_LOCK_UNLOCK);
- LockBuffer(buf, BT_WRITE);
+ _bt_unlockbuf(rel, buf);
+ _bt_lockbuf(rel, buf, BT_WRITE);
}
if (P_INCOMPLETE_SPLIT(opaque))
@@ -1413,7 +1412,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
* There's no actually-matching data on this page. Try to advance to
* the next page. Return false if there's no matching data at all.
*/
- LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
+ _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
if (!_bt_steppage(scan, dir))
return false;
}
@@ -2061,7 +2060,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
* deleted.
*/
if (BTScanPosIsPinned(so->currPos))
- LockBuffer(so->currPos.buf, BT_READ);
+ _bt_lockbuf(rel, so->currPos.buf, BT_READ);
else
so->currPos.buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ);
@@ -2439,7 +2438,7 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
* There's no actually-matching data on this page. Try to advance to
* the next page. Return false if there's no matching data at all.
*/
- LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
+ _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
if (!_bt_steppage(scan, dir))
return false;
}
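
The unlock/relock window described in the rewritten _bt_search() comment is a recurring pattern; in sketch form, every such caller must revalidate after reacquiring the lock:

    /*
     * Give up the read lock, then take a write lock; a concurrent split
     * can happen inside this window.
     */
    _bt_unlockbuf(rel, *bufP);
    _bt_lockbuf(rel, *bufP, BT_WRITE);

    /*
     * Revalidate: if a split moved the key's range right, _bt_moveright()
     * follows the right-links to the page's new sibling.
     */
    *bufP = _bt_moveright(rel, key, *bufP, true, stack_in, BT_WRITE,
                          snapshot);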
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index 7c33711a9f3..81589b9056d 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -1744,7 +1744,7 @@ _bt_killitems(IndexScanDesc scan)
* LSN.
*/
droppedpin = false;
- LockBuffer(so->currPos.buf, BT_READ);
+ _bt_lockbuf(scan->indexRelation, so->currPos.buf, BT_READ);
page = BufferGetPage(so->currPos.buf);
}
@@ -1885,7 +1885,7 @@ _bt_killitems(IndexScanDesc scan)
MarkBufferDirtyHint(so->currPos.buf, true);
}
- LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
+ _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
}
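
The _bt_killitems() flow these two hunks bracket is an opportunistic hint-write under a read lock. A hedged sketch (the killedsomething flag lives in the surrounding function, not shown in this hunk):

    _bt_lockbuf(scan->indexRelation, so->currPos.buf, BT_READ);
    /* ... set LP_DEAD hint bits on index tuples known to be dead ... */
    if (killedsomething)
        MarkBufferDirtyHint(so->currPos.buf, true);
    _bt_unlockbuf(scan->indexRelation, so->currPos.buf);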
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 9b9303ff650..f1ae6f9f844 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -1639,8 +1639,8 @@ PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
* Assume that we acquired a buffer pin for the purposes of
* Valgrind buffer client checks (even in !result case) to
* keep things simple. Buffers that are unsafe to access are
- * not generally guaranteed to be marked undefined in any
- * case.
+ * not generally guaranteed to be marked undefined or
+ * non-accessible in any case.
*/
VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(buf), BLCKSZ);
break;
@@ -1649,7 +1649,16 @@ PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
}
else
{
- /* If we previously pinned the buffer, it must surely be valid */
+ /*
+ * If we previously pinned the buffer, it must surely be valid.
+ *
+ * Note: We deliberately avoid a Valgrind client request here.
+ * Individual access methods can optionally superimpose buffer page
+ * client requests on top of our client requests to enforce that
+ * buffers are only accessed while locked (and pinned). It's possible
+ * that the buffer page is legitimately non-accessible here. We
+ * cannot meddle with that.
+ */
result = true;
}
@@ -1745,7 +1754,13 @@ UnpinBuffer(BufferDesc *buf, bool fixOwner)
uint32 buf_state;
uint32 old_buf_state;
- /* Mark undefined, now that no pins remain in backend */
+ /*
+ * Mark buffer non-accessible to Valgrind.
+ *
+ * Note that the buffer may have already been marked non-accessible
+ * within access method code that enforces that buffers are only
+ * accessed while a buffer lock is held.
+ */
VALGRIND_MAKE_MEM_NOACCESS(BufHdrGetBlock(buf), BLCKSZ);
/* I'd better not still hold any locks on the buffer */
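
To summarize the layering these bufmgr.c comments describe, a hedged sketch of when each Valgrind client request fires for a shared (non-local) buffer:

    Buffer buf = ReadBuffer(rel, blkno);    /* first pin: bufmgr.c marks the
                                             * block DEFINED */
    _bt_lockbuf(rel, buf, BT_READ);         /* nbtree re-marks it DEFINED */
    /* ... page accesses are legal here ... */
    _bt_unlockbuf(rel, buf);                /* nbtree marks it NOACCESS even
                                             * though the pin remains */
    ReleaseBuffer(buf);                     /* last pin: bufmgr.c marks it
                                             * NOACCESS again (harmless) */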
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 79506c748b2..f5274cc7508 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -1073,6 +1073,10 @@ extern Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access);
extern Buffer _bt_relandgetbuf(Relation rel, Buffer obuf,
BlockNumber blkno, int access);
extern void _bt_relbuf(Relation rel, Buffer buf);
+extern void _bt_lockbuf(Relation rel, Buffer buf, int access);
+extern void _bt_unlockbuf(Relation rel, Buffer buf);
+extern bool _bt_conditionallockbuf(Relation rel, Buffer buf);
+extern void _bt_upgradelockbufcleanup(Relation rel, Buffer buf);
extern void _bt_pageinit(Page page, Size size);
extern bool _bt_page_recyclable(Page page);
extern void _bt_delitems_vacuum(Relation rel, Buffer buf,
diff --git a/src/include/pg_config_manual.h b/src/include/pg_config_manual.h
index 45b6a457896..705dc69c06a 100644
--- a/src/include/pg_config_manual.h
+++ b/src/include/pg_config_manual.h
@@ -271,11 +271,13 @@
* Valgrind understands PostgreSQL memory contexts. This permits detecting
* memory errors that Valgrind would not detect on a vanilla build. It also
* enables detection of buffer accesses that take place without holding a
- * buffer pin. See also src/tools/valgrind.supp.
+ * buffer pin (or without holding a buffer lock in the case of index access
+ * methods that superimpose their own custom client requests on top of the
+ * generic bufmgr.c requests). See also src/tools/valgrind.supp.
*
* "make installcheck" is significantly slower under Valgrind. The client
- * requests fall in hot code paths, so USE_VALGRIND slows native execution by
- * a few percentage points even when not run under Valgrind.
+ * requests fall in hot code paths, so USE_VALGRIND slows execution by a few
+ * percentage points even when not run under Valgrind.
*
* You should normally use MEMORY_CONTEXT_CHECKING with USE_VALGRIND;
* instrumentation of repalloc() is inferior without it.
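
For reference, all of the checks discussed here are compiled in by defining USE_VALGRIND, typically by editing this header or passing the define through CPPFLAGS (a hedged example; build details vary by environment):

    /* in src/include/pg_config_manual.h, or via CPPFLAGS=-DUSE_VALGRIND */
    #define USE_VALGRIND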