diff options
Diffstat (limited to 'src/backend/storage')
-rw-r--r-- | src/backend/storage/aio/aio.c | 140 | ||||
-rw-r--r-- | src/backend/storage/aio/aio_callback.c | 7 | ||||
-rw-r--r-- | src/backend/storage/aio/aio_io.c | 4 | ||||
-rw-r--r-- | src/backend/storage/aio/method_io_uring.c | 8 | ||||
-rw-r--r-- | src/backend/storage/aio/method_worker.c | 7 | ||||
-rw-r--r-- | src/backend/storage/buffer/bufmgr.c | 10 | ||||
-rw-r--r-- | src/backend/storage/buffer/localbuf.c | 23 | ||||
-rw-r--r-- | src/backend/storage/file/fd.c | 19 | ||||
-rw-r--r-- | src/backend/storage/ipc/dsm_registry.c | 265 | ||||
-rw-r--r-- | src/backend/storage/ipc/ipci.c | 3 | ||||
-rw-r--r-- | src/backend/storage/ipc/procsignal.c | 3 | ||||
-rw-r--r-- | src/backend/storage/ipc/shmem.c | 4 | ||||
-rw-r--r-- | src/backend/storage/lmgr/lmgr.c | 6 | ||||
-rw-r--r-- | src/backend/storage/lmgr/lock.c | 2 | ||||
-rw-r--r-- | src/backend/storage/lmgr/lwlock.c | 2 | ||||
-rw-r--r-- | src/backend/storage/lmgr/proc.c | 1 |
16 files changed, 426 insertions, 78 deletions
diff --git a/src/backend/storage/aio/aio.c b/src/backend/storage/aio/aio.c index ebb5a771bfd..3643f27ad6e 100644 --- a/src/backend/storage/aio/aio.c +++ b/src/backend/storage/aio/aio.c @@ -184,6 +184,8 @@ pgaio_io_acquire(struct ResourceOwnerData *resowner, PgAioReturn *ret) PgAioHandle * pgaio_io_acquire_nb(struct ResourceOwnerData *resowner, PgAioReturn *ret) { + PgAioHandle *ioh = NULL; + if (pgaio_my_backend->num_staged_ios >= PGAIO_SUBMIT_BATCH_SIZE) { Assert(pgaio_my_backend->num_staged_ios == PGAIO_SUBMIT_BATCH_SIZE); @@ -193,10 +195,17 @@ pgaio_io_acquire_nb(struct ResourceOwnerData *resowner, PgAioReturn *ret) if (pgaio_my_backend->handed_out_io) elog(ERROR, "API violation: Only one IO can be handed out"); + /* + * Probably not needed today, as interrupts should not process this IO, + * but... + */ + HOLD_INTERRUPTS(); + if (!dclist_is_empty(&pgaio_my_backend->idle_ios)) { dlist_node *ion = dclist_pop_head_node(&pgaio_my_backend->idle_ios); - PgAioHandle *ioh = dclist_container(PgAioHandle, node, ion); + + ioh = dclist_container(PgAioHandle, node, ion); Assert(ioh->state == PGAIO_HS_IDLE); Assert(ioh->owner_procno == MyProcNumber); @@ -212,11 +221,11 @@ pgaio_io_acquire_nb(struct ResourceOwnerData *resowner, PgAioReturn *ret) ioh->report_return = ret; ret->result.status = PGAIO_RS_UNKNOWN; } - - return ioh; } - return NULL; + RESUME_INTERRUPTS(); + + return ioh; } /* @@ -233,6 +242,12 @@ pgaio_io_release(PgAioHandle *ioh) Assert(ioh->resowner); pgaio_my_backend->handed_out_io = NULL; + + /* + * Note that no interrupts are processed between the handed_out_io + * check and the call to reclaim - that's important as otherwise an + * interrupt could have already reclaimed the handle. + */ pgaio_io_reclaim(ioh); } else @@ -251,6 +266,12 @@ pgaio_io_release_resowner(dlist_node *ioh_node, bool on_error) Assert(ioh->resowner); + /* + * Otherwise an interrupt, in the middle of releasing the IO, could end up + * trying to wait for the IO, leading to state confusion. + */ + HOLD_INTERRUPTS(); + ResourceOwnerForgetAioHandle(ioh->resowner, &ioh->resowner_node); ioh->resowner = NULL; @@ -291,6 +312,8 @@ pgaio_io_release_resowner(dlist_node *ioh_node, bool on_error) */ if (ioh->report_return) ioh->report_return = NULL; + + RESUME_INTERRUPTS(); } /* @@ -359,6 +382,13 @@ pgaio_io_get_wref(PgAioHandle *ioh, PgAioWaitRef *iow) static inline void pgaio_io_update_state(PgAioHandle *ioh, PgAioHandleState new_state) { + /* + * All callers need to have held interrupts in some form, otherwise + * interrupt processing could wait for the IO to complete, while in an + * intermediary state. + */ + Assert(!INTERRUPTS_CAN_BE_PROCESSED()); + pgaio_debug_io(DEBUG5, ioh, "updating state to %s", pgaio_io_state_get_name(new_state)); @@ -396,6 +426,13 @@ pgaio_io_stage(PgAioHandle *ioh, PgAioOp op) Assert(pgaio_my_backend->handed_out_io == ioh); Assert(pgaio_io_has_target(ioh)); + /* + * Otherwise an interrupt, in the middle of staging and possibly executing + * the IO, could end up trying to wait for the IO, leading to state + * confusion. + */ + HOLD_INTERRUPTS(); + ioh->op = op; ioh->result = 0; @@ -435,6 +472,8 @@ pgaio_io_stage(PgAioHandle *ioh, PgAioOp op) pgaio_io_prepare_submit(ioh); pgaio_io_perform_synchronously(ioh); } + + RESUME_INTERRUPTS(); } bool @@ -517,6 +556,13 @@ bool pgaio_io_was_recycled(PgAioHandle *ioh, uint64 ref_generation, PgAioHandleState *state) { *state = ioh->state; + + /* + * Ensure that we don't see an earlier state of the handle than ioh->state + * due to compiler or CPU reordering. This protects both ->generation as + * directly used here, and other fields in the handle accessed in the + * caller if the handle was not reused. + */ pg_read_barrier(); return ioh->generation != ref_generation; @@ -544,8 +590,8 @@ pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation) && state != PGAIO_HS_COMPLETED_SHARED && state != PGAIO_HS_COMPLETED_LOCAL) { - elog(PANIC, "waiting for own IO in wrong state: %d", - state); + elog(PANIC, "waiting for own IO %d in wrong state: %s", + pgaio_io_get_id(ioh), pgaio_io_get_state_name(ioh)); } } @@ -599,7 +645,13 @@ pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation) case PGAIO_HS_COMPLETED_SHARED: case PGAIO_HS_COMPLETED_LOCAL: - /* see above */ + + /* + * Note that no interrupts are processed between + * pgaio_io_was_recycled() and this check - that's important + * as otherwise an interrupt could have already reclaimed the + * handle. + */ if (am_owner) pgaio_io_reclaim(ioh); return; @@ -610,6 +662,11 @@ pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation) /* * Make IO handle ready to be reused after IO has completed or after the * handle has been released without being used. + * + * Note that callers need to be careful about only calling this in the right + * state and that no interrupts can be processed between the state check and + * the call to pgaio_io_reclaim(). Otherwise interrupt processing could + * already have reclaimed the handle. */ static void pgaio_io_reclaim(PgAioHandle *ioh) @@ -618,6 +675,9 @@ pgaio_io_reclaim(PgAioHandle *ioh) Assert(ioh->owner_procno == MyProcNumber); Assert(ioh->state != PGAIO_HS_IDLE); + /* see comment in function header */ + HOLD_INTERRUPTS(); + /* * It's a bit ugly, but right now the easiest place to put the execution * of local completion callbacks is this function, as we need to execute @@ -685,6 +745,8 @@ pgaio_io_reclaim(PgAioHandle *ioh) * efficient in cases where only a few IOs are used. */ dclist_push_head(&pgaio_my_backend->idle_ios, &ioh->node); + + RESUME_INTERRUPTS(); } /* @@ -697,10 +759,10 @@ pgaio_io_wait_for_free(void) { int reclaimed = 0; - pgaio_debug(DEBUG2, "waiting for free IO with %d pending, %d in-flight, %d idle IOs", + pgaio_debug(DEBUG2, "waiting for free IO with %d pending, %u in-flight, %u idle IOs", pgaio_my_backend->num_staged_ios, dclist_count(&pgaio_my_backend->in_flight_ios), - dclist_is_empty(&pgaio_my_backend->idle_ios)); + dclist_count(&pgaio_my_backend->idle_ios)); /* * First check if any of our IOs actually have completed - when using @@ -714,6 +776,16 @@ pgaio_io_wait_for_free(void) if (ioh->state == PGAIO_HS_COMPLETED_SHARED) { + /* + * Note that no interrupts are processed between the state check + * and the call to reclaim - that's important as otherwise an + * interrupt could have already reclaimed the handle. + * + * Need to ensure that there's no reordering, in the more common + * paths, where we wait for IO, that's done by + * pgaio_io_was_recycled(). + */ + pg_read_barrier(); pgaio_io_reclaim(ioh); reclaimed++; } @@ -730,13 +802,17 @@ pgaio_io_wait_for_free(void) if (pgaio_my_backend->num_staged_ios > 0) pgaio_submit_staged(); + /* possibly some IOs finished during submission */ + if (!dclist_is_empty(&pgaio_my_backend->idle_ios)) + return; + if (dclist_count(&pgaio_my_backend->in_flight_ios) == 0) ereport(ERROR, errmsg_internal("no free IOs despite no in-flight IOs"), - errdetail_internal("%d pending, %d in-flight, %d idle IOs", + errdetail_internal("%d pending, %u in-flight, %u idle IOs", pgaio_my_backend->num_staged_ios, dclist_count(&pgaio_my_backend->in_flight_ios), - dclist_is_empty(&pgaio_my_backend->idle_ios))); + dclist_count(&pgaio_my_backend->idle_ios))); /* * Wait for the oldest in-flight IO to complete. @@ -747,6 +823,7 @@ pgaio_io_wait_for_free(void) { PgAioHandle *ioh = dclist_head_element(PgAioHandle, node, &pgaio_my_backend->in_flight_ios); + uint64 generation = ioh->generation; switch (ioh->state) { @@ -763,20 +840,36 @@ pgaio_io_wait_for_free(void) case PGAIO_HS_COMPLETED_IO: case PGAIO_HS_SUBMITTED: pgaio_debug_io(DEBUG2, ioh, - "waiting for free io with %d in flight", + "waiting for free io with %u in flight", dclist_count(&pgaio_my_backend->in_flight_ios)); /* * In a more general case this would be racy, because the * generation could increase after we read ioh->state above. * But we are only looking at IOs by the current backend and - * the IO can only be recycled by this backend. + * the IO can only be recycled by this backend. Even this is + * only OK because we get the handle's generation before + * potentially processing interrupts, e.g. as part of + * pgaio_debug_io(). */ - pgaio_io_wait(ioh, ioh->generation); + pgaio_io_wait(ioh, generation); break; case PGAIO_HS_COMPLETED_SHARED: - /* it's possible that another backend just finished this IO */ + + /* + * It's possible that another backend just finished this IO. + * + * Note that no interrupts are processed between the state + * check and the call to reclaim - that's important as + * otherwise an interrupt could have already reclaimed the + * handle. + * + * Need to ensure that there's no reordering, in the more + * common paths, where we wait for IO, that's done by + * pgaio_io_was_recycled(). + */ + pg_read_barrier(); pgaio_io_reclaim(ioh); break; } @@ -926,6 +1019,11 @@ pgaio_wref_check_done(PgAioWaitRef *iow) if (state == PGAIO_HS_COMPLETED_SHARED || state == PGAIO_HS_COMPLETED_LOCAL) { + /* + * Note that no interrupts are processed between + * pgaio_io_was_recycled() and this check - that's important as + * otherwise an interrupt could have already reclaimed the handle. + */ if (am_owner) pgaio_io_reclaim(ioh); return true; @@ -1153,11 +1251,14 @@ pgaio_closing_fd(int fd) { dlist_iter iter; PgAioHandle *ioh = NULL; + uint64 generation; dclist_foreach(iter, &pgaio_my_backend->in_flight_ios) { ioh = dclist_container(PgAioHandle, node, iter.cur); + generation = ioh->generation; + if (pgaio_io_uses_fd(ioh, fd)) break; else @@ -1168,11 +1269,11 @@ pgaio_closing_fd(int fd) break; pgaio_debug_io(DEBUG2, ioh, - "waiting for IO before FD %d gets closed, %d in-flight IOs", + "waiting for IO before FD %d gets closed, %u in-flight IOs", fd, dclist_count(&pgaio_my_backend->in_flight_ios)); /* see comment in pgaio_io_wait_for_free() about raciness */ - pgaio_io_wait(ioh, ioh->generation); + pgaio_io_wait(ioh, generation); } } } @@ -1201,13 +1302,14 @@ pgaio_shutdown(int code, Datum arg) while (!dclist_is_empty(&pgaio_my_backend->in_flight_ios)) { PgAioHandle *ioh = dclist_head_element(PgAioHandle, node, &pgaio_my_backend->in_flight_ios); + uint64 generation = ioh->generation; pgaio_debug_io(DEBUG2, ioh, - "waiting for IO to complete during shutdown, %d in-flight IOs", + "waiting for IO to complete during shutdown, %u in-flight IOs", dclist_count(&pgaio_my_backend->in_flight_ios)); /* see comment in pgaio_io_wait_for_free() about raciness */ - pgaio_io_wait(ioh, ioh->generation); + pgaio_io_wait(ioh, generation); } pgaio_my_backend = NULL; diff --git a/src/backend/storage/aio/aio_callback.c b/src/backend/storage/aio/aio_callback.c index 0ad9795bb7e..03c9bba0802 100644 --- a/src/backend/storage/aio/aio_callback.c +++ b/src/backend/storage/aio/aio_callback.c @@ -256,6 +256,9 @@ pgaio_io_call_complete_shared(PgAioHandle *ioh) pgaio_result_status_string(result.status), result.id, result.error_data, result.result); result = ce->cb->complete_shared(ioh, result, cb_data); + + /* the callback should never transition to unknown */ + Assert(result.status != PGAIO_RS_UNKNOWN); } ioh->distilled_result = result; @@ -290,6 +293,7 @@ pgaio_io_call_complete_local(PgAioHandle *ioh) /* start with distilled result from shared callback */ result = ioh->distilled_result; + Assert(result.status != PGAIO_RS_UNKNOWN); for (int i = ioh->num_callbacks; i > 0; i--) { @@ -306,6 +310,9 @@ pgaio_io_call_complete_local(PgAioHandle *ioh) pgaio_result_status_string(result.status), result.id, result.error_data, result.result); result = ce->cb->complete_local(ioh, result, cb_data); + + /* the callback should never transition to unknown */ + Assert(result.status != PGAIO_RS_UNKNOWN); } /* diff --git a/src/backend/storage/aio/aio_io.c b/src/backend/storage/aio/aio_io.c index 00e176135a6..520b5077df2 100644 --- a/src/backend/storage/aio/aio_io.c +++ b/src/backend/storage/aio/aio_io.c @@ -181,9 +181,9 @@ pgaio_io_get_op_name(PgAioHandle *ioh) case PGAIO_OP_INVALID: return "invalid"; case PGAIO_OP_READV: - return "read"; + return "readv"; case PGAIO_OP_WRITEV: - return "write"; + return "writev"; } return NULL; /* silence compiler */ diff --git a/src/backend/storage/aio/method_io_uring.c b/src/backend/storage/aio/method_io_uring.c index c719ba2727a..b78048328e1 100644 --- a/src/backend/storage/aio/method_io_uring.c +++ b/src/backend/storage/aio/method_io_uring.c @@ -126,7 +126,7 @@ pgaio_uring_shmem_size(void) static void pgaio_uring_shmem_init(bool first_time) { - int TotalProcs = MaxBackends + NUM_AUXILIARY_PROCS - MAX_IO_WORKERS; + int TotalProcs = pgaio_uring_procs(); bool found; pgaio_uring_contexts = (PgAioUringContext *) @@ -400,9 +400,9 @@ pgaio_uring_wait_one(PgAioHandle *ioh, uint64 ref_generation) while (true) { pgaio_debug_io(DEBUG3, ioh, - "wait_one io_gen: %llu, ref_gen: %llu, cycle %d", - (long long unsigned) ioh->generation, - (long long unsigned) ref_generation, + "wait_one io_gen: %" PRIu64 ", ref_gen: %" PRIu64 ", cycle %d", + ioh->generation, + ref_generation, waited); if (pgaio_io_was_recycled(ioh, ref_generation, &state) || diff --git a/src/backend/storage/aio/method_worker.c b/src/backend/storage/aio/method_worker.c index 743cccc2acd..36be179678d 100644 --- a/src/backend/storage/aio/method_worker.c +++ b/src/backend/storage/aio/method_worker.c @@ -461,7 +461,12 @@ IoWorkerMain(const void *startup_data, size_t startup_data_len) int nwakeups = 0; int worker; - /* Try to get a job to do. */ + /* + * Try to get a job to do. + * + * The lwlock acquisition also provides the necessary memory barrier + * to ensure that we don't see an outdated data in the handle. + */ LWLockAcquire(AioWorkerSubmissionQueueLock, LW_EXCLUSIVE); if ((io_index = pgaio_worker_submission_queue_consume()) == UINT32_MAX) { diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index f93131a645e..bd68d7e0ca9 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -4550,11 +4550,9 @@ DropRelationBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum, if (RelFileLocatorBackendIsTemp(rlocator)) { if (rlocator.backend == MyProcNumber) - { - for (j = 0; j < nforks; j++) - DropRelationLocalBuffers(rlocator.locator, forkNum[j], - firstDelBlock[j]); - } + DropRelationLocalBuffers(rlocator.locator, forkNum, nforks, + firstDelBlock); + return; } @@ -7320,7 +7318,7 @@ buffer_readv_report(PgAioResult result, const PgAioTargetData *td, affected_count > 1 ? errdetail("Block %u held first zeroed page.", first + first_off) : 0, - errhint("See server log for details about the other %u invalid block(s).", + errhint("See server log for details about the other %d invalid block(s).", affected_count + checkfail_count - 1)); return; } diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c index 63101d56a07..3da9c41ee1d 100644 --- a/src/backend/storage/buffer/localbuf.c +++ b/src/backend/storage/buffer/localbuf.c @@ -629,7 +629,7 @@ InvalidateLocalBuffer(BufferDesc *bufHdr, bool check_unreferenced) */ if (check_unreferenced && (LocalRefCount[bufid] != 0 || BUF_STATE_GET_REFCOUNT(buf_state) != 0)) - elog(ERROR, "block %u of %s is still referenced (local %u)", + elog(ERROR, "block %u of %s is still referenced (local %d)", bufHdr->tag.blockNum, relpathbackend(BufTagGetRelFileLocator(&bufHdr->tag), MyProcNumber, @@ -660,10 +660,11 @@ InvalidateLocalBuffer(BufferDesc *bufHdr, bool check_unreferenced) * See DropRelationBuffers in bufmgr.c for more notes. */ void -DropRelationLocalBuffers(RelFileLocator rlocator, ForkNumber forkNum, - BlockNumber firstDelBlock) +DropRelationLocalBuffers(RelFileLocator rlocator, ForkNumber *forkNum, + int nforks, BlockNumber *firstDelBlock) { int i; + int j; for (i = 0; i < NLocBuffer; i++) { @@ -672,12 +673,18 @@ DropRelationLocalBuffers(RelFileLocator rlocator, ForkNumber forkNum, buf_state = pg_atomic_read_u32(&bufHdr->state); - if ((buf_state & BM_TAG_VALID) && - BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator) && - BufTagGetForkNum(&bufHdr->tag) == forkNum && - bufHdr->tag.blockNum >= firstDelBlock) + if (!(buf_state & BM_TAG_VALID) || + !BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator)) + continue; + + for (j = 0; j < nforks; j++) { - InvalidateLocalBuffer(bufHdr, true); + if (BufTagGetForkNum(&bufHdr->tag) == forkNum[j] && + bufHdr->tag.blockNum >= firstDelBlock[j]) + { + InvalidateLocalBuffer(bufHdr, true); + break; + } } } } diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index 0e8299dd556..a4ec7959f31 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -400,25 +400,22 @@ pg_fsync(int fd) * portable, even if it runs ok on the current system. * * We assert here that a descriptor for a file was opened with write - * permissions (either O_RDWR or O_WRONLY) and for a directory without - * write permissions (O_RDONLY). + * permissions (i.e., not O_RDONLY) and for a directory without write + * permissions (O_RDONLY). Notice that the assertion check is made even + * if fsync() is disabled. * - * Ignore any fstat errors and let the follow-up fsync() do its work. - * Doing this sanity check here counts for the case where fsync() is - * disabled. + * If fstat() fails, ignore it and let the follow-up fsync() complain. */ if (fstat(fd, &st) == 0) { int desc_flags = fcntl(fd, F_GETFL); - /* - * O_RDONLY is historically 0, so just make sure that for directories - * no write flags are used. - */ + desc_flags &= O_ACCMODE; + if (S_ISDIR(st.st_mode)) - Assert((desc_flags & (O_RDWR | O_WRONLY)) == 0); + Assert(desc_flags == O_RDONLY); else - Assert((desc_flags & (O_RDWR | O_WRONLY)) != 0); + Assert(desc_flags != O_RDONLY); } errno = 0; #endif diff --git a/src/backend/storage/ipc/dsm_registry.c b/src/backend/storage/ipc/dsm_registry.c index 1d4fd31ffed..828c2ff0c7f 100644 --- a/src/backend/storage/ipc/dsm_registry.c +++ b/src/backend/storage/ipc/dsm_registry.c @@ -15,6 +15,20 @@ * current backend. This function guarantees that only one backend * initializes the segment and that all other backends just attach it. * + * A DSA can be created in or retrieved from the registry by calling + * GetNamedDSA(). As with GetNamedDSMSegment(), if a DSA with the provided + * name does not yet exist, it is created. Otherwise, GetNamedDSA() + * ensures the DSA is attached to the current backend. This function + * guarantees that only one backend initializes the DSA and that all other + * backends just attach it. + * + * A dshash table can be created in or retrieved from the registry by + * calling GetNamedDSHash(). As with GetNamedDSMSegment(), if a hash + * table with the provided name does not yet exist, it is created. + * Otherwise, GetNamedDSHash() ensures the hash table is attached to the + * current backend. This function guarantees that only one backend + * initializes the table and that all other backends just attach it. + * * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -32,6 +46,12 @@ #include "storage/shmem.h" #include "utils/memutils.h" +#define DSMR_NAME_LEN 128 + +#define DSMR_DSA_TRANCHE_SUFFIX " DSA" +#define DSMR_DSA_TRANCHE_SUFFIX_LEN (sizeof(DSMR_DSA_TRANCHE_SUFFIX) - 1) +#define DSMR_DSA_TRANCHE_NAME_LEN (DSMR_NAME_LEN + DSMR_DSA_TRANCHE_SUFFIX_LEN) + typedef struct DSMRegistryCtxStruct { dsa_handle dsah; @@ -40,15 +60,48 @@ typedef struct DSMRegistryCtxStruct static DSMRegistryCtxStruct *DSMRegistryCtx; -typedef struct DSMRegistryEntry +typedef struct NamedDSMState { - char name[64]; dsm_handle handle; size_t size; +} NamedDSMState; + +typedef struct NamedDSAState +{ + dsa_handle handle; + int tranche; + char tranche_name[DSMR_DSA_TRANCHE_NAME_LEN]; +} NamedDSAState; + +typedef struct NamedDSHState +{ + NamedDSAState dsa; + dshash_table_handle handle; + int tranche; + char tranche_name[DSMR_NAME_LEN]; +} NamedDSHState; + +typedef enum DSMREntryType +{ + DSMR_ENTRY_TYPE_DSM, + DSMR_ENTRY_TYPE_DSA, + DSMR_ENTRY_TYPE_DSH, +} DSMREntryType; + +typedef struct DSMRegistryEntry +{ + char name[DSMR_NAME_LEN]; + DSMREntryType type; + union + { + NamedDSMState dsm; + NamedDSAState dsa; + NamedDSHState dsh; + } data; } DSMRegistryEntry; static const dshash_parameters dsh_params = { - offsetof(DSMRegistryEntry, handle), + offsetof(DSMRegistryEntry, type), sizeof(DSMRegistryEntry), dshash_strcmp, dshash_strhash, @@ -141,7 +194,7 @@ GetNamedDSMSegment(const char *name, size_t size, ereport(ERROR, (errmsg("DSM segment name cannot be empty"))); - if (strlen(name) >= offsetof(DSMRegistryEntry, handle)) + if (strlen(name) >= offsetof(DSMRegistryEntry, type)) ereport(ERROR, (errmsg("DSM segment name too long"))); @@ -158,32 +211,39 @@ GetNamedDSMSegment(const char *name, size_t size, entry = dshash_find_or_insert(dsm_registry_table, name, found); if (!(*found)) { + NamedDSMState *state = &entry->data.dsm; + dsm_segment *seg; + + entry->type = DSMR_ENTRY_TYPE_DSM; + /* Initialize the segment. */ - dsm_segment *seg = dsm_create(size, 0); + seg = dsm_create(size, 0); dsm_pin_segment(seg); dsm_pin_mapping(seg); - entry->handle = dsm_segment_handle(seg); - entry->size = size; + state->handle = dsm_segment_handle(seg); + state->size = size; ret = dsm_segment_address(seg); if (init_callback) (*init_callback) (ret); } - else if (entry->size != size) - { + else if (entry->type != DSMR_ENTRY_TYPE_DSM) ereport(ERROR, - (errmsg("requested DSM segment size does not match size of " - "existing segment"))); - } + (errmsg("requested DSM segment does not match type of existing entry"))); + else if (entry->data.dsm.size != size) + ereport(ERROR, + (errmsg("requested DSM segment size does not match size of existing segment"))); else { - dsm_segment *seg = dsm_find_mapping(entry->handle); + NamedDSMState *state = &entry->data.dsm; + dsm_segment *seg; /* If the existing segment is not already attached, attach it now. */ + seg = dsm_find_mapping(state->handle); if (seg == NULL) { - seg = dsm_attach(entry->handle); + seg = dsm_attach(state->handle); if (seg == NULL) elog(ERROR, "could not map dynamic shared memory segment"); @@ -198,3 +258,180 @@ GetNamedDSMSegment(const char *name, size_t size, return ret; } + +/* + * Initialize or attach a named DSA. + * + * This routine returns a pointer to the DSA. A new LWLock tranche ID will be + * generated if needed. Note that the lock tranche will be registered with the + * provided name. Also note that this should be called at most once for a + * given DSA in each backend. + */ +dsa_area * +GetNamedDSA(const char *name, bool *found) +{ + DSMRegistryEntry *entry; + MemoryContext oldcontext; + dsa_area *ret; + + Assert(found); + + if (!name || *name == '\0') + ereport(ERROR, + (errmsg("DSA name cannot be empty"))); + + if (strlen(name) >= offsetof(DSMRegistryEntry, type)) + ereport(ERROR, + (errmsg("DSA name too long"))); + + /* Be sure any local memory allocated by DSM/DSA routines is persistent. */ + oldcontext = MemoryContextSwitchTo(TopMemoryContext); + + /* Connect to the registry. */ + init_dsm_registry(); + + entry = dshash_find_or_insert(dsm_registry_table, name, found); + if (!(*found)) + { + NamedDSAState *state = &entry->data.dsa; + + entry->type = DSMR_ENTRY_TYPE_DSA; + + /* Initialize the LWLock tranche for the DSA. */ + state->tranche = LWLockNewTrancheId(); + strcpy(state->tranche_name, name); + LWLockRegisterTranche(state->tranche, state->tranche_name); + + /* Initialize the DSA. */ + ret = dsa_create(state->tranche); + dsa_pin(ret); + dsa_pin_mapping(ret); + + /* Store handle for other backends to use. */ + state->handle = dsa_get_handle(ret); + } + else if (entry->type != DSMR_ENTRY_TYPE_DSA) + ereport(ERROR, + (errmsg("requested DSA does not match type of existing entry"))); + else + { + NamedDSAState *state = &entry->data.dsa; + + if (dsa_is_attached(state->handle)) + ereport(ERROR, + (errmsg("requested DSA already attached to current process"))); + + /* Initialize existing LWLock tranche for the DSA. */ + LWLockRegisterTranche(state->tranche, state->tranche_name); + + /* Attach to existing DSA. */ + ret = dsa_attach(state->handle); + dsa_pin_mapping(ret); + } + + dshash_release_lock(dsm_registry_table, entry); + MemoryContextSwitchTo(oldcontext); + + return ret; +} + +/* + * Initialize or attach a named dshash table. + * + * This routine returns the address of the table. The tranche_id member of + * params is ignored; new tranche IDs will be generated if needed. Note that + * the DSA lock tranche will be registered with the provided name with " DSA" + * appended. The dshash lock tranche will be registered with the provided + * name. Also note that this should be called at most once for a given table + * in each backend. + */ +dshash_table * +GetNamedDSHash(const char *name, const dshash_parameters *params, bool *found) +{ + DSMRegistryEntry *entry; + MemoryContext oldcontext; + dshash_table *ret; + + Assert(params); + Assert(found); + + if (!name || *name == '\0') + ereport(ERROR, + (errmsg("DSHash name cannot be empty"))); + + if (strlen(name) >= offsetof(DSMRegistryEntry, type)) + ereport(ERROR, + (errmsg("DSHash name too long"))); + + /* Be sure any local memory allocated by DSM/DSA routines is persistent. */ + oldcontext = MemoryContextSwitchTo(TopMemoryContext); + + /* Connect to the registry. */ + init_dsm_registry(); + + entry = dshash_find_or_insert(dsm_registry_table, name, found); + if (!(*found)) + { + NamedDSAState *dsa_state = &entry->data.dsh.dsa; + NamedDSHState *dsh_state = &entry->data.dsh; + dshash_parameters params_copy; + dsa_area *dsa; + + entry->type = DSMR_ENTRY_TYPE_DSH; + + /* Initialize the LWLock tranche for the DSA. */ + dsa_state->tranche = LWLockNewTrancheId(); + sprintf(dsa_state->tranche_name, "%s%s", name, DSMR_DSA_TRANCHE_SUFFIX); + LWLockRegisterTranche(dsa_state->tranche, dsa_state->tranche_name); + + /* Initialize the LWLock tranche for the dshash table. */ + dsh_state->tranche = LWLockNewTrancheId(); + strcpy(dsh_state->tranche_name, name); + LWLockRegisterTranche(dsh_state->tranche, dsh_state->tranche_name); + + /* Initialize the DSA for the hash table. */ + dsa = dsa_create(dsa_state->tranche); + dsa_pin(dsa); + dsa_pin_mapping(dsa); + + /* Initialize the dshash table. */ + memcpy(¶ms_copy, params, sizeof(dshash_parameters)); + params_copy.tranche_id = dsh_state->tranche; + ret = dshash_create(dsa, ¶ms_copy, NULL); + + /* Store handles for other backends to use. */ + dsa_state->handle = dsa_get_handle(dsa); + dsh_state->handle = dshash_get_hash_table_handle(ret); + } + else if (entry->type != DSMR_ENTRY_TYPE_DSH) + ereport(ERROR, + (errmsg("requested DSHash does not match type of existing entry"))); + else + { + NamedDSAState *dsa_state = &entry->data.dsh.dsa; + NamedDSHState *dsh_state = &entry->data.dsh; + dsa_area *dsa; + + /* XXX: Should we verify params matches what table was created with? */ + + if (dsa_is_attached(dsa_state->handle)) + ereport(ERROR, + (errmsg("requested DSHash already attached to current process"))); + + /* Initialize existing LWLock tranches for the DSA and dshash table. */ + LWLockRegisterTranche(dsa_state->tranche, dsa_state->tranche_name); + LWLockRegisterTranche(dsh_state->tranche, dsh_state->tranche_name); + + /* Attach to existing DSA for the hash table. */ + dsa = dsa_attach(dsa_state->handle); + dsa_pin_mapping(dsa); + + /* Attach to existing dshash table. */ + ret = dshash_attach(dsa, params, dsh_state->handle, NULL); + } + + dshash_release_lock(dsm_registry_table, entry); + MemoryContextSwitchTo(oldcontext); + + return ret; +} diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index 00c76d05356..2fa045e6b0f 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -51,7 +51,6 @@ #include "storage/sinvaladt.h" #include "utils/guc.h" #include "utils/injection_point.h" -#include "utils/memutils.h" /* GUCs */ int shared_memory_type = DEFAULT_SHARED_MEMORY_TYPE; @@ -151,7 +150,6 @@ CalculateShmemSize(int *num_semaphores) size = add_size(size, InjectionPointShmemSize()); size = add_size(size, SlotSyncShmemSize()); size = add_size(size, AioShmemSize()); - size = add_size(size, MemoryContextReportingShmemSize()); /* include additional requested shmem from preload libraries */ size = add_size(size, total_addin_request); @@ -345,7 +343,6 @@ CreateOrAttachShmemStructs(void) WaitEventCustomShmemInit(); InjectionPointShmemInit(); AioShmemInit(); - MemoryContextReportingShmemInit(); } /* diff --git a/src/backend/storage/ipc/procsignal.c b/src/backend/storage/ipc/procsignal.c index ce69e26d720..a9bb540b55a 100644 --- a/src/backend/storage/ipc/procsignal.c +++ b/src/backend/storage/ipc/procsignal.c @@ -691,9 +691,6 @@ procsignal_sigusr1_handler(SIGNAL_ARGS) if (CheckProcSignal(PROCSIG_LOG_MEMORY_CONTEXT)) HandleLogMemoryContextInterrupt(); - if (CheckProcSignal(PROCSIG_GET_MEMORY_CONTEXT)) - HandleGetMemoryContextInterrupt(); - if (CheckProcSignal(PROCSIG_PARALLEL_APPLY_MESSAGE)) HandleParallelApplyMessageInterrupt(); diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c index c9ae3b45b76..ca3656fc76f 100644 --- a/src/backend/storage/ipc/shmem.c +++ b/src/backend/storage/ipc/shmem.c @@ -679,12 +679,10 @@ pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS) */ for (i = 0; i < shm_ent_page_count; i++) { - volatile uint64 touch pg_attribute_unused(); - page_ptrs[i] = startptr + (i * os_page_size); if (firstNumaTouch) - pg_numa_touch_mem_if_required(touch, page_ptrs[i]); + pg_numa_touch_mem_if_required(page_ptrs[i]); CHECK_FOR_INTERRUPTS(); } diff --git a/src/backend/storage/lmgr/lmgr.c b/src/backend/storage/lmgr/lmgr.c index f50962983c3..3f6bf70bd3c 100644 --- a/src/backend/storage/lmgr/lmgr.c +++ b/src/backend/storage/lmgr/lmgr.c @@ -717,7 +717,10 @@ XactLockTableWait(TransactionId xid, Relation rel, ItemPointer ctid, * through, to avoid slowing down the normal case.) */ if (!first) + { + CHECK_FOR_INTERRUPTS(); pg_usleep(1000L); + } first = false; xid = SubTransGetTopmostTransaction(xid); } @@ -757,7 +760,10 @@ ConditionalXactLockTableWait(TransactionId xid, bool logLockFailure) /* See XactLockTableWait about this case */ if (!first) + { + CHECK_FOR_INTERRUPTS(); pg_usleep(1000L); + } first = false; xid = SubTransGetTopmostTransaction(xid); } diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c index 86b06b9223f..2776ceb295b 100644 --- a/src/backend/storage/lmgr/lock.c +++ b/src/backend/storage/lmgr/lock.c @@ -51,7 +51,7 @@ /* GUC variables */ int max_locks_per_xact; /* used to set the lock table size */ -bool log_lock_failure = false; +bool log_lock_failures = false; #define NLOCKENTS() \ mul_size(max_locks_per_xact, add_size(MaxBackends, max_prepared_xacts)) diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c index 5148ef982e3..46f44bc4511 100644 --- a/src/backend/storage/lmgr/lwlock.c +++ b/src/backend/storage/lmgr/lwlock.c @@ -178,8 +178,6 @@ static const char *const BuiltinTrancheNames[] = { [LWTRANCHE_XACT_SLRU] = "XactSLRU", [LWTRANCHE_PARALLEL_VACUUM_DSA] = "ParallelVacuumDSA", [LWTRANCHE_AIO_URING_COMPLETION] = "AioUringCompletion", - [LWTRANCHE_MEMORY_CONTEXT_REPORTING_STATE] = "MemoryContextReportingState", - [LWTRANCHE_MEMORY_CONTEXT_REPORTING_PROC] = "MemoryContextReportingPerProcess", }; StaticAssertDecl(lengthof(BuiltinTrancheNames) == diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index f194e6b3dcc..e9ef0fbfe32 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -50,7 +50,6 @@ #include "storage/procsignal.h" #include "storage/spin.h" #include "storage/standby.h" -#include "utils/memutils.h" #include "utils/timeout.h" #include "utils/timestamp.h" |