Diffstat (limited to 'src')
-rw-r--r-- | src/backend/access/nbtree/nbtinsert.c |   2
-rw-r--r-- | src/backend/access/nbtree/nbtpage.c   | 145
-rw-r--r-- | src/backend/access/nbtree/nbtree.c    |   5
-rw-r--r-- | src/backend/access/nbtree/nbtsearch.c |  23
-rw-r--r-- | src/backend/access/nbtree/nbtutils.c  |   4
-rw-r--r-- | src/backend/storage/buffer/bufmgr.c   |  23
-rw-r--r-- | src/include/access/nbtree.h           |   4
-rw-r--r-- | src/include/pg_config_manual.h        |   8
8 files changed, 170 insertions, 44 deletions
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index b86c122763e..e3a44bc09e0 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -303,7 +303,7 @@ _bt_search_insert(Relation rel, BTInsertState insertstate)
 	{
 		/* Simulate a _bt_getbuf() call with conditional locking */
 		insertstate->buf = ReadBuffer(rel, RelationGetTargetBlock(rel));
-		if (ConditionalLockBuffer(insertstate->buf))
+		if (_bt_conditionallockbuf(rel, insertstate->buf))
 		{
 			Page		page;
 			BTPageOpaque lpageop;
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index 75628e0eb98..70bac0052fc 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -32,6 +32,7 @@
 #include "storage/indexfsm.h"
 #include "storage/lmgr.h"
 #include "storage/predicate.h"
+#include "utils/memdebug.h"
 #include "utils/snapmgr.h"
 
 static BTMetaPageData *_bt_getmeta(Relation rel, Buffer metabuf);
@@ -198,8 +199,8 @@ _bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact,
 	}
 
 	/* trade in our read lock for a write lock */
-	LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
-	LockBuffer(metabuf, BT_WRITE);
+	_bt_unlockbuf(rel, metabuf);
+	_bt_lockbuf(rel, metabuf, BT_WRITE);
 
 	START_CRIT_SECTION();
@@ -340,8 +341,8 @@ _bt_getroot(Relation rel, int access)
 		}
 
 		/* trade in our read lock for a write lock */
-		LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
-		LockBuffer(metabuf, BT_WRITE);
+		_bt_unlockbuf(rel, metabuf);
+		_bt_lockbuf(rel, metabuf, BT_WRITE);
 
 		/*
 		 * Race condition:	if someone else initialized the metadata between
@@ -434,8 +435,8 @@ _bt_getroot(Relation rel, int access)
 		 * else accessing the new root page while it's unlocked, since no one
 		 * else knows where it is yet.
 		 */
-		LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK);
-		LockBuffer(rootbuf, BT_READ);
+		_bt_unlockbuf(rel, rootbuf);
+		_bt_lockbuf(rel, rootbuf, BT_READ);
 
 		/* okay, metadata is correct, release lock on it without caching */
 		_bt_relbuf(rel, metabuf);
@@ -783,10 +784,20 @@ _bt_log_reuse_page(Relation rel, BlockNumber blkno, TransactionId latestRemovedXid)
  *		blkno == P_NEW means to get an unallocated index page.  The page
  *		will be initialized before returning it.
  *
+ *		The general rule in nbtree is that it's never okay to access a
+ *		page without holding both a buffer pin and a buffer lock on
+ *		the page's buffer.
+ *
  *		When this routine returns, the appropriate lock is set on the
  *		requested buffer and its reference count has been incremented
  *		(ie, the buffer is "locked and pinned").  Also, we apply
- *		_bt_checkpage to sanity-check the page (except in P_NEW case).
+ *		_bt_checkpage to sanity-check the page (except in P_NEW case),
+ *		and perform Valgrind client requests that help Valgrind detect
+ *		unsafe page accesses.
+ *
+ *		Note: raw LockBuffer() calls are disallowed in nbtree; all
+ *		buffer lock requests need to go through wrapper functions such
+ *		as _bt_lockbuf().
  */
 Buffer
 _bt_getbuf(Relation rel, BlockNumber blkno, int access)
@@ -797,7 +808,7 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access)
 	{
 		/* Read an existing block of the relation */
 		buf = ReadBuffer(rel, blkno);
-		LockBuffer(buf, access);
+		_bt_lockbuf(rel, buf, access);
 		_bt_checkpage(rel, buf);
 	}
 	else
@@ -837,7 +848,7 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access)
 			if (blkno == InvalidBlockNumber)
 				break;
 			buf = ReadBuffer(rel, blkno);
-			if (ConditionalLockBuffer(buf))
+			if (_bt_conditionallockbuf(rel, buf))
 			{
 				page = BufferGetPage(buf);
 				if (_bt_page_recyclable(page))
@@ -890,7 +901,7 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access)
 		buf = ReadBuffer(rel, P_NEW);
 
 		/* Acquire buffer lock on new page */
-		LockBuffer(buf, BT_WRITE);
+		_bt_lockbuf(rel, buf, BT_WRITE);
 
 		/*
 		 * Release the file-extension lock; it's now OK for someone else to
@@ -931,9 +942,10 @@ _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
 	Assert(blkno != P_NEW);
 	if (BufferIsValid(obuf))
-		LockBuffer(obuf, BUFFER_LOCK_UNLOCK);
+		_bt_unlockbuf(rel, obuf);
 	buf = ReleaseAndReadBuffer(obuf, rel, blkno);
-	LockBuffer(buf, access);
+	_bt_lockbuf(rel, buf, access);
+
 	_bt_checkpage(rel, buf);
 	return buf;
 }
@@ -946,7 +958,102 @@
 void
 _bt_relbuf(Relation rel, Buffer buf)
 {
-	UnlockReleaseBuffer(buf);
+	_bt_unlockbuf(rel, buf);
+	ReleaseBuffer(buf);
+}
+
+/*
+ * _bt_lockbuf() -- lock a pinned buffer.
+ *
+ * Lock is acquired without acquiring another pin.  This is like a raw
+ * LockBuffer() call, but performs extra steps needed by Valgrind.
+ *
+ * Note: Caller may need to call _bt_checkpage() with buf when pin on buf
+ * wasn't originally acquired in _bt_getbuf() or _bt_relandgetbuf().
+ */
+void
+_bt_lockbuf(Relation rel, Buffer buf, int access)
+{
+	/* LockBuffer() asserts that pin is held by this backend */
+	LockBuffer(buf, access);
+
+	/*
+	 * It doesn't matter that _bt_unlockbuf() won't get called in the
+	 * event of an nbtree error (e.g. a unique violation error).  That
+	 * won't cause Valgrind false positives.
+	 *
+	 * The nbtree client requests are superimposed on top of the
+	 * bufmgr.c buffer pin client requests.  In the event of an nbtree
+	 * error the buffer will certainly get marked as defined when the
+	 * backend once again acquires its first pin on the buffer. (Of
+	 * course, if the backend never touches the buffer again then it
+	 * doesn't matter that it remains non-accessible to Valgrind.)
+	 *
+	 * Note: When an IndexTuple C pointer gets computed using an
+	 * ItemId read from a page while a lock was held, the C pointer
+	 * becomes unsafe to dereference forever as soon as the lock is
+	 * released.  Valgrind can only detect cases where the pointer
+	 * gets dereferenced with no _current_ lock/pin held, though.
+	 */
+	if (!RelationUsesLocalBuffers(rel))
+		VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
+}
+
+/*
+ * _bt_unlockbuf() -- unlock a pinned buffer.
+ */
+void
+_bt_unlockbuf(Relation rel, Buffer buf)
+{
+	/*
+	 * Buffer is pinned and locked, which means that it is expected to be
+	 * defined and addressable.  Check that proactively.
+	 */
+	VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), BLCKSZ);
+
+	/* LockBuffer() asserts that pin is held by this backend */
+	LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+
+	if (!RelationUsesLocalBuffers(rel))
+		VALGRIND_MAKE_MEM_NOACCESS(BufferGetPage(buf), BLCKSZ);
+}
+
+/*
+ * _bt_conditionallockbuf() -- conditionally BT_WRITE lock pinned
+ * buffer.
+ *
+ * Note: Caller may need to call _bt_checkpage() with buf when pin on buf
+ * wasn't originally acquired in _bt_getbuf() or _bt_relandgetbuf().
+ */
+bool
+_bt_conditionallockbuf(Relation rel, Buffer buf)
+{
+	/* ConditionalLockBuffer() asserts that pin is held by this backend */
+	if (!ConditionalLockBuffer(buf))
+		return false;
+
+	if (!RelationUsesLocalBuffers(rel))
+		VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
+
+	return true;
+}
+
+/*
+ * _bt_upgradelockbufcleanup() -- upgrade lock to super-exclusive/cleanup
+ * lock.
+ */
+void
+_bt_upgradelockbufcleanup(Relation rel, Buffer buf)
+{
+	/*
+	 * Buffer is pinned and locked, which means that it is expected to be
+	 * defined and addressable.  Check that proactively.
+	 */
+	VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), BLCKSZ);
+
+	/* LockBuffer() asserts that pin is held by this backend */
+	LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+	LockBufferForCleanup(buf);
 }
 
 /*
@@ -1580,7 +1687,7 @@ _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact)
 				 * To avoid deadlocks, we'd better drop the leaf page lock
 				 * before going further.
 				 */
-				LockBuffer(leafbuf, BUFFER_LOCK_UNLOCK);
+				_bt_unlockbuf(rel, leafbuf);
 
 				/*
 				 * Check that the left sibling of leafbuf (if any) is not
@@ -1616,7 +1723,7 @@ _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact)
 				 * (Page deletion can cope with the stack being to the left of
 				 * leafbuf, but not to the right of leafbuf.)
 				 */
-				LockBuffer(leafbuf, BT_WRITE);
+				_bt_lockbuf(rel, leafbuf, BT_WRITE);
 				continue;
 			}
@@ -1970,7 +2077,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
 	leafleftsib = opaque->btpo_prev;
 	leafrightsib = opaque->btpo_next;
 
-	LockBuffer(leafbuf, BUFFER_LOCK_UNLOCK);
+	_bt_unlockbuf(rel, leafbuf);
 
 	/*
 	 * Check here, as calling loops will have locks held, preventing
@@ -2005,7 +2112,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
 	 * To avoid deadlocks, we'd better drop the target page lock before
 	 * going further.
 	 */
-	LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+	_bt_unlockbuf(rel, buf);
 }
 
 /*
@@ -2022,7 +2129,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
 	 * table.)
 	 */
 	if (target != leafblkno)
-		LockBuffer(leafbuf, BT_WRITE);
+		_bt_lockbuf(rel, leafbuf, BT_WRITE);
 	if (leftsib != P_NONE)
 	{
 		lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
@@ -2072,7 +2179,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
 	 * rather than a superexclusive lock, since no scan will stop on an empty
 	 * page.
 	 */
-	LockBuffer(buf, BT_WRITE);
+	_bt_lockbuf(rel, buf, BT_WRITE);
 	page = BufferGetPage(buf);
 	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
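The new _bt_getbuf() header comment states the invariant that the Valgrind requests enforce: a page must never be accessed without holding both a pin and a lock on its buffer. As a minimal sketch (hypothetical caller, not code from this patch; rel and blkno stand for whatever the caller has in scope), this is the class of bug a USE_VALGRIND build can now report:

	Buffer		buf = _bt_getbuf(rel, blkno, BT_READ);
	Page		page = BufferGetPage(buf);
	BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	IndexTuple	itup;

	/* Computing the pointer while locked and pinned is fine */
	itup = (IndexTuple) PageGetItem(page,
									PageGetItemId(page, P_FIRSTDATAKEY(opaque)));

	_bt_unlockbuf(rel, buf);	/* lock dropped, pin kept: page marked NOACCESS */

	/*
	 * BUG: only a pin is held here, so under the nbtree rule this
	 * dereference is unsafe; memcheck reports an invalid read.
	 */
	elog(LOG, "tuple size: %zu", (Size) IndexTupleSize(itup));

	ReleaseBuffer(buf);

The pin-level requests in bufmgr.c alone would only catch such an access once the backend's last pin was gone; marking the page NOACCESS at unlock time is what catches accesses made while merely holding a pin.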
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index e947addef6b..d65f4357cc8 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -1115,7 +1115,7 @@ backtrack:
 	 */
 	buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
 							 info->strategy);
-	LockBuffer(buf, BT_READ);
+	_bt_lockbuf(rel, buf, BT_READ);
 	page = BufferGetPage(buf);
 	opaque = NULL;
 	if (!PageIsNew(page))
@@ -1222,8 +1222,7 @@
 	 * course of the vacuum scan, whether or not it actually contains any
 	 * deletable tuples --- see nbtree/README.
	 */
-	LockBuffer(buf, BUFFER_LOCK_UNLOCK);
-	LockBufferForCleanup(buf);
+	_bt_upgradelockbufcleanup(rel, buf);
 
 	/*
 	 * Check whether we need to backtrack to earlier pages.  What we are
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index f228c87a2b7..28dc196b55e 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -64,7 +64,7 @@ static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir);
 static void
 _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
 {
-	LockBuffer(sp->buf, BUFFER_LOCK_UNLOCK);
+	_bt_unlockbuf(scan->indexRelation, sp->buf);
 
 	if (IsMVCCSnapshot(scan->xs_snapshot) &&
 		RelationNeedsWAL(scan->indexRelation) &&
@@ -187,14 +187,13 @@ _bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access,
 	if (access == BT_WRITE && page_access == BT_READ)
 	{
 		/* trade in our read lock for a write lock */
-		LockBuffer(*bufP, BUFFER_LOCK_UNLOCK);
-		LockBuffer(*bufP, BT_WRITE);
+		_bt_unlockbuf(rel, *bufP);
+		_bt_lockbuf(rel, *bufP, BT_WRITE);
 
 		/*
-		 * If the page was split between the time that we surrendered our read
-		 * lock and acquired our write lock, then this page may no longer be
-		 * the right place for the key we want to insert.  In this case, we
-		 * need to move right in the tree.
+		 * Race -- the leaf page may have split after we dropped the read lock
+		 * but before we acquired a write lock.  If it has, we may need to
+		 * move right to its new sibling.  Do that.
		 */
 		*bufP = _bt_moveright(rel, key, *bufP, true, stack_in, BT_WRITE,
 							  snapshot);
@@ -289,8 +288,8 @@ _bt_moveright(Relation rel,
 		/* upgrade our lock if necessary */
 		if (access == BT_READ)
 		{
-			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
-			LockBuffer(buf, BT_WRITE);
+			_bt_unlockbuf(rel, buf);
+			_bt_lockbuf(rel, buf, BT_WRITE);
 		}
 
 		if (P_INCOMPLETE_SPLIT(opaque))
@@ -1413,7 +1412,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 		 * There's no actually-matching data on this page.  Try to advance to
 		 * the next page.  Return false if there's no matching data at all.
 		 */
-		LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
+		_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
 		if (!_bt_steppage(scan, dir))
 			return false;
 	}
@@ -2061,7 +2060,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
 		 * deleted.
 		 */
 		if (BTScanPosIsPinned(so->currPos))
-			LockBuffer(so->currPos.buf, BT_READ);
+			_bt_lockbuf(rel, so->currPos.buf, BT_READ);
 		else
 			so->currPos.buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ);
 
@@ -2439,7 +2438,7 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
 		 * There's no actually-matching data on this page.  Try to advance to
 		 * the next page.  Return false if there's no matching data at all.
 		 */
-		LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
+		_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
 		if (!_bt_steppage(scan, dir))
 			return false;
 	}
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index 7c33711a9f3..81589b9056d 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -1744,7 +1744,7 @@ _bt_killitems(IndexScanDesc scan)
 		 * LSN.
 		 */
 		droppedpin = false;
-		LockBuffer(so->currPos.buf, BT_READ);
+		_bt_lockbuf(scan->indexRelation, so->currPos.buf, BT_READ);
 
 		page = BufferGetPage(so->currPos.buf);
 	}
@@ -1885,7 +1885,7 @@ _bt_killitems(IndexScanDesc scan)
 			MarkBufferDirtyHint(so->currPos.buf, true);
 	}
 
-	LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
+	_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
 }
 
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 9b9303ff650..f1ae6f9f844 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -1639,8 +1639,8 @@ PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
 				 * Assume that we acquired a buffer pin for the purposes of
 				 * Valgrind buffer client checks (even in !result case) to
 				 * keep things simple.  Buffers that are unsafe to access are
-				 * not generally guaranteed to be marked undefined in any
-				 * case.
+				 * not generally guaranteed to be marked undefined or
+				 * non-accessible in any case.
 				 */
 				VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(buf), BLCKSZ);
 				break;
@@ -1649,7 +1649,16 @@
 	}
 	else
 	{
-		/* If we previously pinned the buffer, it must surely be valid */
+		/*
+		 * If we previously pinned the buffer, it must surely be valid.
+		 *
+		 * Note: We deliberately avoid a Valgrind client request here.
+		 * Individual access methods can optionally superimpose buffer page
+		 * client requests on top of our client requests to enforce that
+		 * buffers are only accessed while locked (and pinned).  It's possible
+		 * that the buffer page is legitimately non-accessible here.  We
+		 * cannot meddle with that.
+		 */
 		result = true;
 	}
@@ -1745,7 +1754,13 @@ UnpinBuffer(BufferDesc *buf, bool fixOwner)
 		uint32		buf_state;
 		uint32		old_buf_state;
 
-		/* Mark undefined, now that no pins remain in backend */
+		/*
+		 * Mark buffer non-accessible to Valgrind.
+		 *
+		 * Note that the buffer may have already been marked non-accessible
+		 * within access method code that enforces that buffers are only
+		 * accessed while a buffer lock is held.
+		 */
 		VALGRIND_MAKE_MEM_NOACCESS(BufHdrGetBlock(buf), BLCKSZ);
 
 		/* I'd better not still hold any locks on the buffer */
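Taken together, the bufmgr.c and nbtree hunks superimpose two layers of client requests on the same page. For a shared (non-local) buffer in a USE_VALGRIND build, the accessibility transitions run roughly as follows (an annotated sketch of one backend's view, not code from the patch):

	buf = ReadBuffer(rel, blkno);	/* first pin: bufmgr.c marks page DEFINED */
	_bt_lockbuf(rel, buf, BT_READ);	/* nbtree marks DEFINED (redundant here) */
	/* ... pin and lock both held: page accesses are safe ... */
	_bt_unlockbuf(rel, buf);		/* nbtree marks NOACCESS: the pin alone no
									 * longer permits access within nbtree */
	_bt_lockbuf(rel, buf, BT_READ);	/* DEFINED again */
	_bt_unlockbuf(rel, buf);		/* NOACCESS again */
	ReleaseBuffer(buf);				/* last pin gone: bufmgr.c marks NOACCESS;
									 * as the UnpinBuffer() comment notes, the
									 * page may already be non-accessible */

This layering is also why the already-pinned path in PinBuffer() deliberately makes no client request: the page may be legitimately NOACCESS at that moment because an access method unlocked it while keeping a pin.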
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 79506c748b2..f5274cc7508 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -1073,6 +1073,10 @@ extern Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access);
 extern Buffer _bt_relandgetbuf(Relation rel, Buffer obuf,
 							   BlockNumber blkno, int access);
 extern void _bt_relbuf(Relation rel, Buffer buf);
+extern void _bt_lockbuf(Relation rel, Buffer buf, int access);
+extern void _bt_unlockbuf(Relation rel, Buffer buf);
+extern bool _bt_conditionallockbuf(Relation rel, Buffer buf);
+extern void _bt_upgradelockbufcleanup(Relation rel, Buffer buf);
 extern void _bt_pageinit(Page page, Size size);
 extern bool _bt_page_recyclable(Page page);
 extern void _bt_delitems_vacuum(Relation rel, Buffer buf,
diff --git a/src/include/pg_config_manual.h b/src/include/pg_config_manual.h
index 45b6a457896..705dc69c06a 100644
--- a/src/include/pg_config_manual.h
+++ b/src/include/pg_config_manual.h
@@ -271,11 +271,13 @@
  * Valgrind understands PostgreSQL memory contexts.  This permits detecting
  * memory errors that Valgrind would not detect on a vanilla build.  It also
  * enables detection of buffer accesses that take place without holding a
- * buffer pin.  See also src/tools/valgrind.supp.
+ * buffer pin (or without holding a buffer lock in the case of index access
+ * methods that superimpose their own custom client requests on top of the
+ * generic bufmgr.c requests).  See also src/tools/valgrind.supp.
  *
  * "make installcheck" is significantly slower under Valgrind.  The client
- * requests fall in hot code paths, so USE_VALGRIND slows native execution by
- * a few percentage points even when not run under Valgrind.
+ * requests fall in hot code paths, so USE_VALGRIND slows execution by a few
+ * percentage points even when not run under Valgrind.
  *
  * You should normally use MEMORY_CONTEXT_CHECKING with USE_VALGRIND;
 * instrumentation of repalloc() is inferior without it.
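A closing note on the pg_config_manual.h hunk: the VALGRIND_* macros used throughout the patch come from utils/memdebug.h, which includes Valgrind's memcheck.h when USE_VALGRIND is defined and otherwise defines the requests away, so production builds pay nothing for the new wrappers. A sketch of that arrangement (see src/include/utils/memdebug.h for the authoritative definitions):

	#ifdef USE_VALGRIND
	#include <valgrind/memcheck.h>
	#else
	#define VALGRIND_MAKE_MEM_DEFINED(addr, size)			do {} while (0)
	#define VALGRIND_MAKE_MEM_NOACCESS(addr, size)			do {} while (0)
	#define VALGRIND_CHECK_MEM_IS_DEFINED(addr, size)		do {} while (0)
	#endif

To exercise the new checks, one typical workflow is to define USE_VALGRIND (in pg_config_manual.h, or via CPPFLAGS) and run the server under Valgrind with the project's suppression file, e.g. valgrind --trace-children=yes --suppressions=src/tools/valgrind.supp postgres ...; exact flags vary by setup.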