diff options
author | Peter Geoghegan <pg@bowt.ie> | 2025-03-08 11:10:14 -0500 |
---|---|---|
committer | Peter Geoghegan <pg@bowt.ie> | 2025-03-08 11:10:14 -0500 |
commit | 67fc4c9fd7fab7004b656e0cc27826c75d7ea7ad (patch) | |
tree | 2e0555a4b537036caa392a64aa079c5552334204 /src | |
parent | 8021c77769e90cc804121d61a1bb7bcc4652d48b (diff) | |
download | postgresql-67fc4c9fd7fab7004b656e0cc27826c75d7ea7ad.tar.gz postgresql-67fc4c9fd7fab7004b656e0cc27826c75d7ea7ad.zip |
Make parallel nbtree index scans use an LWLock.
Teach parallel nbtree index scans to use an LWLock (not a spinlock) to
protect the scan's shared descriptor state.
This is preparation for an upcoming patch that will add skip scan
optimizations to nbtree. That patch will occasionally need to allocate
memory while the scan descriptor is locked, when copying datums that
were serialized by another backend. Allocating memory is not safe while
holding a spinlock, so the descriptor must be protected by an LWLock.
Author: Peter Geoghegan <pg@bowt.ie>
Reviewed-By: Matthias van de Meent <boekewurm+postgres@gmail.com>
Discussion: https://postgr.es/m/CAH2-Wz=PKR6rB7qbx+Vnd7eqeB5VTcrW=iJvAsTsKbdG+kW_UA@mail.gmail.com
Diffstat (limited to 'src')
-rw-r--r-- | src/backend/access/nbtree/nbtpreprocesskeys.c | 2 | ||||
-rw-r--r-- | src/backend/access/nbtree/nbtree.c | 27 | ||||
-rw-r--r-- | src/backend/storage/lmgr/lwlock.c | 1 | ||||
-rw-r--r-- | src/backend/utils/activity/wait_event_names.txt | 1 | ||||
-rw-r--r-- | src/include/storage/lwlock.h | 1 |
5 files changed, 18 insertions, 14 deletions
diff --git a/src/backend/access/nbtree/nbtpreprocesskeys.c b/src/backend/access/nbtree/nbtpreprocesskeys.c index 1fd1da5f18b..38a87af1cc8 100644 --- a/src/backend/access/nbtree/nbtpreprocesskeys.c +++ b/src/backend/access/nbtree/nbtpreprocesskeys.c @@ -1565,7 +1565,7 @@ _bt_preprocess_array_keys_final(IndexScanDesc scan, int *keyDataMap) * Parallel index scans require space in shared memory to store the * current array elements (for arrays kept by preprocessing) to schedule * the next primitive index scan. The underlying structure is protected - * using a spinlock, so defensively limit its size. In practice this can + * using an LWLock, so defensively limit its size. In practice this can * only affect parallel scans that use an incomplete opfamily. */ if (scan->parallel_scan && so->numArrayKeys > INDEX_MAX_KEYS) diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 136e9408ae5..25188a644ef 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -70,7 +70,7 @@ typedef struct BTParallelScanDescData BTPS_State btps_pageStatus; /* indicates whether next page is * available for scan. see above for * possible states of parallel scan. 
*/ - slock_t btps_mutex; /* protects above variables, btps_arrElems */ + LWLock btps_lock; /* protects shared parallel state */ ConditionVariable btps_cv; /* used to synchronize parallel scan */ /* @@ -554,7 +554,8 @@ btinitparallelscan(void *target) { BTParallelScanDesc bt_target = (BTParallelScanDesc) target; - SpinLockInit(&bt_target->btps_mutex); + LWLockInitialize(&bt_target->btps_lock, + LWTRANCHE_PARALLEL_BTREE_SCAN); bt_target->btps_nextScanPage = InvalidBlockNumber; bt_target->btps_lastCurrPage = InvalidBlockNumber; bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED; @@ -576,15 +577,15 @@ btparallelrescan(IndexScanDesc scan) parallel_scan->ps_offset); /* - * In theory, we don't need to acquire the spinlock here, because there + * In theory, we don't need to acquire the LWLock here, because there * shouldn't be any other workers running at this point, but we do so for * consistency. */ - SpinLockAcquire(&btscan->btps_mutex); + LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE); btscan->btps_nextScanPage = InvalidBlockNumber; btscan->btps_lastCurrPage = InvalidBlockNumber; btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED; - SpinLockRelease(&btscan->btps_mutex); + LWLockRelease(&btscan->btps_lock); } /* @@ -655,7 +656,7 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page, while (1) { - SpinLockAcquire(&btscan->btps_mutex); + LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE); if (btscan->btps_pageStatus == BTPARALLEL_DONE) { @@ -717,7 +718,7 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *next_scan_page, *last_curr_page = btscan->btps_lastCurrPage; exit_loop = true; } - SpinLockRelease(&btscan->btps_mutex); + LWLockRelease(&btscan->btps_lock); if (exit_loop || !status) break; ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE); @@ -761,11 +762,11 @@ _bt_parallel_release(IndexScanDesc scan, BlockNumber next_scan_page, btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan, parallel_scan->ps_offset); - 
SpinLockAcquire(&btscan->btps_mutex); + LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE); btscan->btps_nextScanPage = next_scan_page; btscan->btps_lastCurrPage = curr_page; btscan->btps_pageStatus = BTPARALLEL_IDLE; - SpinLockRelease(&btscan->btps_mutex); + LWLockRelease(&btscan->btps_lock); ConditionVariableSignal(&btscan->btps_cv); } @@ -804,14 +805,14 @@ _bt_parallel_done(IndexScanDesc scan) * Mark the parallel scan as done, unless some other process did so * already */ - SpinLockAcquire(&btscan->btps_mutex); + LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE); Assert(btscan->btps_pageStatus != BTPARALLEL_NEED_PRIMSCAN); if (btscan->btps_pageStatus != BTPARALLEL_DONE) { btscan->btps_pageStatus = BTPARALLEL_DONE; status_changed = true; } - SpinLockRelease(&btscan->btps_mutex); + LWLockRelease(&btscan->btps_lock); /* wake up all the workers associated with this parallel scan */ if (status_changed) @@ -838,7 +839,7 @@ _bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber curr_page) btscan = (BTParallelScanDesc) OffsetToPointer(parallel_scan, parallel_scan->ps_offset); - SpinLockAcquire(&btscan->btps_mutex); + LWLockAcquire(&btscan->btps_lock, LW_EXCLUSIVE); if (btscan->btps_lastCurrPage == curr_page && btscan->btps_pageStatus == BTPARALLEL_IDLE) { @@ -854,7 +855,7 @@ _bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber curr_page) btscan->btps_arrElems[i] = array->cur_elem; } } - SpinLockRelease(&btscan->btps_mutex); + LWLockRelease(&btscan->btps_lock); } /* diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c index 8adf2730277..5702c35bb91 100644 --- a/src/backend/storage/lmgr/lwlock.c +++ b/src/backend/storage/lmgr/lwlock.c @@ -153,6 +153,7 @@ static const char *const BuiltinTrancheNames[] = { [LWTRANCHE_LOCK_MANAGER] = "LockManager", [LWTRANCHE_PREDICATE_LOCK_MANAGER] = "PredicateLockManager", [LWTRANCHE_PARALLEL_HASH_JOIN] = "ParallelHashJoin", + [LWTRANCHE_PARALLEL_BTREE_SCAN] = "ParallelBtreeScan", 
[LWTRANCHE_PARALLEL_QUERY_DSA] = "ParallelQueryDSA", [LWTRANCHE_PER_SESSION_DSA] = "PerSessionDSA", [LWTRANCHE_PER_SESSION_RECORD_TYPE] = "PerSessionRecordType", diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt index e199f071628..3c594415bfd 100644 --- a/src/backend/utils/activity/wait_event_names.txt +++ b/src/backend/utils/activity/wait_event_names.txt @@ -371,6 +371,7 @@ BufferMapping "Waiting to associate a data block with a buffer in the buffer poo LockManager "Waiting to read or update information about <quote>heavyweight</quote> locks." PredicateLockManager "Waiting to access predicate lock information used by serializable transactions." ParallelHashJoin "Waiting to synchronize workers during Parallel Hash Join plan execution." +ParallelBtreeScan "Waiting to synchronize workers during Parallel B-tree scan plan execution." ParallelQueryDSA "Waiting for parallel query dynamic shared memory allocation." PerSessionDSA "Waiting for parallel query dynamic shared memory allocation." PerSessionRecordType "Waiting to access a parallel query's information about composite types." diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h index 13a7dc89980..ffa03189e2d 100644 --- a/src/include/storage/lwlock.h +++ b/src/include/storage/lwlock.h @@ -194,6 +194,7 @@ typedef enum BuiltinTrancheIds LWTRANCHE_LOCK_MANAGER, LWTRANCHE_PREDICATE_LOCK_MANAGER, LWTRANCHE_PARALLEL_HASH_JOIN, + LWTRANCHE_PARALLEL_BTREE_SCAN, LWTRANCHE_PARALLEL_QUERY_DSA, LWTRANCHE_PER_SESSION_DSA, LWTRANCHE_PER_SESSION_RECORD_TYPE, |