Diffstat (limited to 'src/backend/storage')
-rw-r--r--  src/backend/storage/aio/aio.c              140
-rw-r--r--  src/backend/storage/aio/aio_callback.c       7
-rw-r--r--  src/backend/storage/aio/aio_io.c             4
-rw-r--r--  src/backend/storage/aio/method_io_uring.c    8
-rw-r--r--  src/backend/storage/aio/method_worker.c      7
-rw-r--r--  src/backend/storage/buffer/bufmgr.c         10
-rw-r--r--  src/backend/storage/buffer/localbuf.c       23
-rw-r--r--  src/backend/storage/file/fd.c               19
-rw-r--r--  src/backend/storage/ipc/dsm_registry.c     265
-rw-r--r--  src/backend/storage/ipc/ipci.c               3
-rw-r--r--  src/backend/storage/ipc/procsignal.c         3
-rw-r--r--  src/backend/storage/ipc/shmem.c              4
-rw-r--r--  src/backend/storage/lmgr/lmgr.c              6
-rw-r--r--  src/backend/storage/lmgr/lock.c              2
-rw-r--r--  src/backend/storage/lmgr/lwlock.c            2
-rw-r--r--  src/backend/storage/lmgr/proc.c              1
16 files changed, 426 insertions, 78 deletions
diff --git a/src/backend/storage/aio/aio.c b/src/backend/storage/aio/aio.c
index ebb5a771bfd..3643f27ad6e 100644
--- a/src/backend/storage/aio/aio.c
+++ b/src/backend/storage/aio/aio.c
@@ -184,6 +184,8 @@ pgaio_io_acquire(struct ResourceOwnerData *resowner, PgAioReturn *ret)
PgAioHandle *
pgaio_io_acquire_nb(struct ResourceOwnerData *resowner, PgAioReturn *ret)
{
+ PgAioHandle *ioh = NULL;
+
if (pgaio_my_backend->num_staged_ios >= PGAIO_SUBMIT_BATCH_SIZE)
{
Assert(pgaio_my_backend->num_staged_ios == PGAIO_SUBMIT_BATCH_SIZE);
@@ -193,10 +195,17 @@ pgaio_io_acquire_nb(struct ResourceOwnerData *resowner, PgAioReturn *ret)
if (pgaio_my_backend->handed_out_io)
elog(ERROR, "API violation: Only one IO can be handed out");
+ /*
+ * Probably not needed today, as interrupts should not process this IO,
+ * but hold them anyway so that every change of handle state happens
+ * with interrupts held.
+ */
+ HOLD_INTERRUPTS();
+
if (!dclist_is_empty(&pgaio_my_backend->idle_ios))
{
dlist_node *ion = dclist_pop_head_node(&pgaio_my_backend->idle_ios);
- PgAioHandle *ioh = dclist_container(PgAioHandle, node, ion);
+
+ ioh = dclist_container(PgAioHandle, node, ion);
Assert(ioh->state == PGAIO_HS_IDLE);
Assert(ioh->owner_procno == MyProcNumber);
@@ -212,11 +221,11 @@ pgaio_io_acquire_nb(struct ResourceOwnerData *resowner, PgAioReturn *ret)
ioh->report_return = ret;
ret->result.status = PGAIO_RS_UNKNOWN;
}
-
- return ioh;
}
- return NULL;
+ RESUME_INTERRUPTS();
+
+ return ioh;
}
/*
@@ -233,6 +242,12 @@ pgaio_io_release(PgAioHandle *ioh)
Assert(ioh->resowner);
pgaio_my_backend->handed_out_io = NULL;
+
+ /*
+ * Note that no interrupts are processed between the handed_out_io
+ * check and the call to reclaim - that's important as otherwise an
+ * interrupt could have already reclaimed the handle.
+ */
pgaio_io_reclaim(ioh);
}
else
@@ -251,6 +266,12 @@ pgaio_io_release_resowner(dlist_node *ioh_node, bool on_error)
Assert(ioh->resowner);
+ /*
+ * Otherwise an interrupt, in the middle of releasing the IO, could end up
+ * trying to wait for the IO, leading to state confusion.
+ */
+ HOLD_INTERRUPTS();
+
ResourceOwnerForgetAioHandle(ioh->resowner, &ioh->resowner_node);
ioh->resowner = NULL;
@@ -291,6 +312,8 @@ pgaio_io_release_resowner(dlist_node *ioh_node, bool on_error)
*/
if (ioh->report_return)
ioh->report_return = NULL;
+
+ RESUME_INTERRUPTS();
}
/*
@@ -359,6 +382,13 @@ pgaio_io_get_wref(PgAioHandle *ioh, PgAioWaitRef *iow)
static inline void
pgaio_io_update_state(PgAioHandle *ioh, PgAioHandleState new_state)
{
+ /*
+ * All callers need to have held interrupts in some form; otherwise
+ * interrupt processing could wait for the IO to complete while it is in
+ * an intermediate state.
+ */
+ Assert(!INTERRUPTS_CAN_BE_PROCESSED());
+
pgaio_debug_io(DEBUG5, ioh,
"updating state to %s",
pgaio_io_state_get_name(new_state));
@@ -396,6 +426,13 @@ pgaio_io_stage(PgAioHandle *ioh, PgAioOp op)
Assert(pgaio_my_backend->handed_out_io == ioh);
Assert(pgaio_io_has_target(ioh));
+ /*
+ * Otherwise an interrupt, in the middle of staging and possibly executing
+ * the IO, could end up trying to wait for the IO, leading to state
+ * confusion.
+ */
+ HOLD_INTERRUPTS();
+
ioh->op = op;
ioh->result = 0;
@@ -435,6 +472,8 @@ pgaio_io_stage(PgAioHandle *ioh, PgAioOp op)
pgaio_io_prepare_submit(ioh);
pgaio_io_perform_synchronously(ioh);
}
+
+ RESUME_INTERRUPTS();
}
bool
@@ -517,6 +556,13 @@ bool
pgaio_io_was_recycled(PgAioHandle *ioh, uint64 ref_generation, PgAioHandleState *state)
{
*state = ioh->state;
+
+ /*
+ * Ensure that we don't see an earlier state of the handle than ioh->state
+ * due to compiler or CPU reordering. This protects both ->generation as
+ * directly used here, and other fields in the handle accessed in the
+ * caller if the handle was not reused.
+ */
pg_read_barrier();
return ioh->generation != ref_generation;
@@ -544,8 +590,8 @@ pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation)
&& state != PGAIO_HS_COMPLETED_SHARED
&& state != PGAIO_HS_COMPLETED_LOCAL)
{
- elog(PANIC, "waiting for own IO in wrong state: %d",
- state);
+ elog(PANIC, "waiting for own IO %d in wrong state: %s",
+ pgaio_io_get_id(ioh), pgaio_io_get_state_name(ioh));
}
}
@@ -599,7 +645,13 @@ pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation)
case PGAIO_HS_COMPLETED_SHARED:
case PGAIO_HS_COMPLETED_LOCAL:
- /* see above */
+
+ /*
+ * Note that no interrupts are processed between
+ * pgaio_io_was_recycled() and this check - that's important
+ * as otherwise an interrupt could have already reclaimed the
+ * handle.
+ */
if (am_owner)
pgaio_io_reclaim(ioh);
return;
@@ -610,6 +662,11 @@ pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation)
/*
* Make IO handle ready to be reused after IO has completed or after the
* handle has been released without being used.
+ *
+ * Note that callers need to be careful to call this only in the right
+ * state, and to ensure that no interrupts are processed between the state
+ * check and the call to pgaio_io_reclaim(). Otherwise interrupt processing
+ * could already have reclaimed the handle.
*/
static void
pgaio_io_reclaim(PgAioHandle *ioh)
@@ -618,6 +675,9 @@ pgaio_io_reclaim(PgAioHandle *ioh)
Assert(ioh->owner_procno == MyProcNumber);
Assert(ioh->state != PGAIO_HS_IDLE);
+ /* see comment in function header */
+ HOLD_INTERRUPTS();
+
/*
* It's a bit ugly, but right now the easiest place to put the execution
* of local completion callbacks is this function, as we need to execute
@@ -685,6 +745,8 @@ pgaio_io_reclaim(PgAioHandle *ioh)
* efficient in cases where only a few IOs are used.
*/
dclist_push_head(&pgaio_my_backend->idle_ios, &ioh->node);
+
+ RESUME_INTERRUPTS();
}
/*
@@ -697,10 +759,10 @@ pgaio_io_wait_for_free(void)
{
int reclaimed = 0;
- pgaio_debug(DEBUG2, "waiting for free IO with %d pending, %d in-flight, %d idle IOs",
+ pgaio_debug(DEBUG2, "waiting for free IO with %d pending, %u in-flight, %u idle IOs",
pgaio_my_backend->num_staged_ios,
dclist_count(&pgaio_my_backend->in_flight_ios),
- dclist_is_empty(&pgaio_my_backend->idle_ios));
+ dclist_count(&pgaio_my_backend->idle_ios));
/*
* First check if any of our IOs actually have completed - when using
@@ -714,6 +776,16 @@ pgaio_io_wait_for_free(void)
if (ioh->state == PGAIO_HS_COMPLETED_SHARED)
{
+ /*
+ * Note that no interrupts are processed between the state check
+ * and the call to reclaim - that's important as otherwise an
+ * interrupt could have already reclaimed the handle.
+ *
+ * We also need to ensure that there's no reordering here; in the more
+ * common paths, where we wait for the IO, that is taken care of by
+ * pgaio_io_was_recycled().
+ */
+ pg_read_barrier();
pgaio_io_reclaim(ioh);
reclaimed++;
}
@@ -730,13 +802,17 @@ pgaio_io_wait_for_free(void)
if (pgaio_my_backend->num_staged_ios > 0)
pgaio_submit_staged();
+ /* possibly some IOs finished during submission */
+ if (!dclist_is_empty(&pgaio_my_backend->idle_ios))
+ return;
+
if (dclist_count(&pgaio_my_backend->in_flight_ios) == 0)
ereport(ERROR,
errmsg_internal("no free IOs despite no in-flight IOs"),
- errdetail_internal("%d pending, %d in-flight, %d idle IOs",
+ errdetail_internal("%d pending, %u in-flight, %u idle IOs",
pgaio_my_backend->num_staged_ios,
dclist_count(&pgaio_my_backend->in_flight_ios),
- dclist_is_empty(&pgaio_my_backend->idle_ios)));
+ dclist_count(&pgaio_my_backend->idle_ios)));
/*
* Wait for the oldest in-flight IO to complete.
@@ -747,6 +823,7 @@ pgaio_io_wait_for_free(void)
{
PgAioHandle *ioh = dclist_head_element(PgAioHandle, node,
&pgaio_my_backend->in_flight_ios);
+ uint64 generation = ioh->generation;
switch (ioh->state)
{
@@ -763,20 +840,36 @@ pgaio_io_wait_for_free(void)
case PGAIO_HS_COMPLETED_IO:
case PGAIO_HS_SUBMITTED:
pgaio_debug_io(DEBUG2, ioh,
- "waiting for free io with %d in flight",
+ "waiting for free io with %u in flight",
dclist_count(&pgaio_my_backend->in_flight_ios));
/*
* In a more general case this would be racy, because the
* generation could increase after we read ioh->state above.
* But we are only looking at IOs by the current backend and
- * the IO can only be recycled by this backend.
+ * the IO can only be recycled by this backend. Even this is
+ * only OK because we get the handle's generation before
+ * potentially processing interrupts, e.g. as part of
+ * pgaio_debug_io().
*/
- pgaio_io_wait(ioh, ioh->generation);
+ pgaio_io_wait(ioh, generation);
break;
case PGAIO_HS_COMPLETED_SHARED:
- /* it's possible that another backend just finished this IO */
+
+ /*
+ * It's possible that another backend just finished this IO.
+ *
+ * Note that no interrupts are processed between the state
+ * check and the call to reclaim - that's important as
+ * otherwise an interrupt could have already reclaimed the
+ * handle.
+ *
+ * We also need to ensure that there's no reordering here; in the
+ * more common paths, where we wait for the IO, that is taken care
+ * of by pgaio_io_was_recycled().
+ */
+ pg_read_barrier();
pgaio_io_reclaim(ioh);
break;
}
@@ -926,6 +1019,11 @@ pgaio_wref_check_done(PgAioWaitRef *iow)
if (state == PGAIO_HS_COMPLETED_SHARED ||
state == PGAIO_HS_COMPLETED_LOCAL)
{
+ /*
+ * Note that no interrupts are processed between
+ * pgaio_io_was_recycled() and this check - that's important as
+ * otherwise an interrupt could have already reclaimed the handle.
+ */
if (am_owner)
pgaio_io_reclaim(ioh);
return true;
@@ -1153,11 +1251,14 @@ pgaio_closing_fd(int fd)
{
dlist_iter iter;
PgAioHandle *ioh = NULL;
+ uint64 generation;
dclist_foreach(iter, &pgaio_my_backend->in_flight_ios)
{
ioh = dclist_container(PgAioHandle, node, iter.cur);
+ generation = ioh->generation;
+
if (pgaio_io_uses_fd(ioh, fd))
break;
else
@@ -1168,11 +1269,11 @@ pgaio_closing_fd(int fd)
break;
pgaio_debug_io(DEBUG2, ioh,
- "waiting for IO before FD %d gets closed, %d in-flight IOs",
+ "waiting for IO before FD %d gets closed, %u in-flight IOs",
fd, dclist_count(&pgaio_my_backend->in_flight_ios));
/* see comment in pgaio_io_wait_for_free() about raciness */
- pgaio_io_wait(ioh, ioh->generation);
+ pgaio_io_wait(ioh, generation);
}
}
}
@@ -1201,13 +1302,14 @@ pgaio_shutdown(int code, Datum arg)
while (!dclist_is_empty(&pgaio_my_backend->in_flight_ios))
{
PgAioHandle *ioh = dclist_head_element(PgAioHandle, node, &pgaio_my_backend->in_flight_ios);
+ uint64 generation = ioh->generation;
pgaio_debug_io(DEBUG2, ioh,
- "waiting for IO to complete during shutdown, %d in-flight IOs",
+ "waiting for IO to complete during shutdown, %u in-flight IOs",
dclist_count(&pgaio_my_backend->in_flight_ios));
/* see comment in pgaio_io_wait_for_free() about raciness */
- pgaio_io_wait(ioh, ioh->generation);
+ pgaio_io_wait(ioh, generation);
}
pgaio_my_backend = NULL;
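
The common thread in the aio.c changes above is a check-then-act discipline: any code that inspects a handle's state and then reclaims or waits on it must keep interrupts held across both steps, because interrupt processing can itself wait on or reclaim AIO handles. Below is a minimal standalone sketch of that discipline; HOLD_INTERRUPTS(), RESUME_INTERRUPTS() and INTERRUPTS_CAN_BE_PROCESSED() mirror the backend macros but are re-declared here so the example compiles on its own, and the Handle type is a stand-in, not PgAioHandle.

#include <assert.h>
#include <stdio.h>

static volatile int InterruptHoldoffCount = 0;

#define HOLD_INTERRUPTS()	(InterruptHoldoffCount++)
#define RESUME_INTERRUPTS() \
	(assert(InterruptHoldoffCount > 0), InterruptHoldoffCount--)
#define INTERRUPTS_CAN_BE_PROCESSED() (InterruptHoldoffCount == 0)

typedef enum
{
	HS_IDLE,
	HS_COMPLETED_SHARED
} HandleState;

typedef struct
{
	HandleState state;
} Handle;

static void
reclaim(Handle *h)
{
	/* mirrors the new assertion in pgaio_io_update_state() */
	assert(!INTERRUPTS_CAN_BE_PROCESSED());
	h->state = HS_IDLE;
}

int
main(void)
{
	Handle		h = {.state = HS_COMPLETED_SHARED};

	/*
	 * No interrupt may be processed between the state check and the
	 * reclaim; otherwise an interrupt could reclaim the handle first and
	 * we would operate on a recycled handle.
	 */
	HOLD_INTERRUPTS();
	if (h.state == HS_COMPLETED_SHARED)
		reclaim(&h);
	RESUME_INTERRUPTS();

	printf("state=%d, holdoff=%d\n", (int) h.state, InterruptHoldoffCount);
	return 0;
}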
diff --git a/src/backend/storage/aio/aio_callback.c b/src/backend/storage/aio/aio_callback.c
index 0ad9795bb7e..03c9bba0802 100644
--- a/src/backend/storage/aio/aio_callback.c
+++ b/src/backend/storage/aio/aio_callback.c
@@ -256,6 +256,9 @@ pgaio_io_call_complete_shared(PgAioHandle *ioh)
pgaio_result_status_string(result.status),
result.id, result.error_data, result.result);
result = ce->cb->complete_shared(ioh, result, cb_data);
+
+ /* the callback should never transition to unknown */
+ Assert(result.status != PGAIO_RS_UNKNOWN);
}
ioh->distilled_result = result;
@@ -290,6 +293,7 @@ pgaio_io_call_complete_local(PgAioHandle *ioh)
/* start with distilled result from shared callback */
result = ioh->distilled_result;
+ Assert(result.status != PGAIO_RS_UNKNOWN);
for (int i = ioh->num_callbacks; i > 0; i--)
{
@@ -306,6 +310,9 @@ pgaio_io_call_complete_local(PgAioHandle *ioh)
pgaio_result_status_string(result.status),
result.id, result.error_data, result.result);
result = ce->cb->complete_local(ioh, result, cb_data);
+
+ /* the callback should never transition to unknown */
+ Assert(result.status != PGAIO_RS_UNKNOWN);
}
/*
diff --git a/src/backend/storage/aio/aio_io.c b/src/backend/storage/aio/aio_io.c
index 00e176135a6..520b5077df2 100644
--- a/src/backend/storage/aio/aio_io.c
+++ b/src/backend/storage/aio/aio_io.c
@@ -181,9 +181,9 @@ pgaio_io_get_op_name(PgAioHandle *ioh)
case PGAIO_OP_INVALID:
return "invalid";
case PGAIO_OP_READV:
- return "read";
+ return "readv";
case PGAIO_OP_WRITEV:
- return "write";
+ return "writev";
}
return NULL; /* silence compiler */
diff --git a/src/backend/storage/aio/method_io_uring.c b/src/backend/storage/aio/method_io_uring.c
index c719ba2727a..b78048328e1 100644
--- a/src/backend/storage/aio/method_io_uring.c
+++ b/src/backend/storage/aio/method_io_uring.c
@@ -126,7 +126,7 @@ pgaio_uring_shmem_size(void)
static void
pgaio_uring_shmem_init(bool first_time)
{
- int TotalProcs = MaxBackends + NUM_AUXILIARY_PROCS - MAX_IO_WORKERS;
+ int TotalProcs = pgaio_uring_procs();
bool found;
pgaio_uring_contexts = (PgAioUringContext *)
@@ -400,9 +400,9 @@ pgaio_uring_wait_one(PgAioHandle *ioh, uint64 ref_generation)
while (true)
{
pgaio_debug_io(DEBUG3, ioh,
- "wait_one io_gen: %llu, ref_gen: %llu, cycle %d",
- (long long unsigned) ioh->generation,
- (long long unsigned) ref_generation,
+ "wait_one io_gen: %" PRIu64 ", ref_gen: %" PRIu64 ", cycle %d",
+ ioh->generation,
+ ref_generation,
waited);
if (pgaio_io_was_recycled(ioh, ref_generation, &state) ||
diff --git a/src/backend/storage/aio/method_worker.c b/src/backend/storage/aio/method_worker.c
index 743cccc2acd..36be179678d 100644
--- a/src/backend/storage/aio/method_worker.c
+++ b/src/backend/storage/aio/method_worker.c
@@ -461,7 +461,12 @@ IoWorkerMain(const void *startup_data, size_t startup_data_len)
int nwakeups = 0;
int worker;
- /* Try to get a job to do. */
+ /*
+ * Try to get a job to do.
+ *
+ * The lwlock acquisition also provides the necessary memory barrier
+ * to ensure that we don't see outdated data in the handle.
+ */
LWLockAcquire(AioWorkerSubmissionQueueLock, LW_EXCLUSIVE);
if ((io_index = pgaio_worker_submission_queue_consume()) == UINT32_MAX)
{
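
The worker-side comment above and the pg_read_barrier() calls added in aio.c are two halves of the same ordering protocol: a waiter must load the handle's state before its generation, and the recycler must publish a new generation before reusing the handle. The sketch below restates that protocol with C11 atomics; it is an illustrative analogue, not backend code, and the fences stand in for pg_read_barrier()/pg_write_barrier() (in IoWorkerMain the LWLock acquisition provides the barrier instead).

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

typedef struct
{
	_Atomic int state;
	_Atomic uint64_t generation;
} Handle;

/* reader side, analogous to pgaio_io_was_recycled() */
static bool
was_recycled(Handle *h, uint64_t ref_generation, int *state)
{
	*state = atomic_load_explicit(&h->state, memory_order_relaxed);
	/*
	 * Plays the role of pg_read_barrier(): the state load above cannot be
	 * reordered past the generation load below.
	 */
	atomic_thread_fence(memory_order_acquire);
	return atomic_load_explicit(&h->generation,
								memory_order_relaxed) != ref_generation;
}

/* writer side, analogous to the owner recycling the handle */
static void
recycle(Handle *h)
{
	atomic_fetch_add_explicit(&h->generation, 1, memory_order_relaxed);
	/*
	 * Plays the role of pg_write_barrier(): the generation bump must be
	 * visible before the handle is reset for reuse.
	 */
	atomic_thread_fence(memory_order_release);
	atomic_store_explicit(&h->state, 0, memory_order_relaxed);
}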
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index f93131a645e..bd68d7e0ca9 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -4550,11 +4550,9 @@ DropRelationBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum,
if (RelFileLocatorBackendIsTemp(rlocator))
{
if (rlocator.backend == MyProcNumber)
- {
- for (j = 0; j < nforks; j++)
- DropRelationLocalBuffers(rlocator.locator, forkNum[j],
- firstDelBlock[j]);
- }
+ DropRelationLocalBuffers(rlocator.locator, forkNum, nforks,
+ firstDelBlock);
+
return;
}
@@ -7320,7 +7318,7 @@ buffer_readv_report(PgAioResult result, const PgAioTargetData *td,
affected_count > 1 ?
errdetail("Block %u held first zeroed page.",
first + first_off) : 0,
- errhint("See server log for details about the other %u invalid block(s).",
+ errhint("See server log for details about the other %d invalid block(s).",
affected_count + checkfail_count - 1));
return;
}
diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c
index 63101d56a07..3da9c41ee1d 100644
--- a/src/backend/storage/buffer/localbuf.c
+++ b/src/backend/storage/buffer/localbuf.c
@@ -629,7 +629,7 @@ InvalidateLocalBuffer(BufferDesc *bufHdr, bool check_unreferenced)
*/
if (check_unreferenced &&
(LocalRefCount[bufid] != 0 || BUF_STATE_GET_REFCOUNT(buf_state) != 0))
- elog(ERROR, "block %u of %s is still referenced (local %u)",
+ elog(ERROR, "block %u of %s is still referenced (local %d)",
bufHdr->tag.blockNum,
relpathbackend(BufTagGetRelFileLocator(&bufHdr->tag),
MyProcNumber,
@@ -660,10 +660,11 @@ InvalidateLocalBuffer(BufferDesc *bufHdr, bool check_unreferenced)
* See DropRelationBuffers in bufmgr.c for more notes.
*/
void
-DropRelationLocalBuffers(RelFileLocator rlocator, ForkNumber forkNum,
- BlockNumber firstDelBlock)
+DropRelationLocalBuffers(RelFileLocator rlocator, ForkNumber *forkNum,
+ int nforks, BlockNumber *firstDelBlock)
{
int i;
+ int j;
for (i = 0; i < NLocBuffer; i++)
{
@@ -672,12 +673,18 @@ DropRelationLocalBuffers(RelFileLocator rlocator, ForkNumber forkNum,
buf_state = pg_atomic_read_u32(&bufHdr->state);
- if ((buf_state & BM_TAG_VALID) &&
- BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator) &&
- BufTagGetForkNum(&bufHdr->tag) == forkNum &&
- bufHdr->tag.blockNum >= firstDelBlock)
+ if (!(buf_state & BM_TAG_VALID) ||
+ !BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator))
+ continue;
+
+ for (j = 0; j < nforks; j++)
{
- InvalidateLocalBuffer(bufHdr, true);
+ if (BufTagGetForkNum(&bufHdr->tag) == forkNum[j] &&
+ bufHdr->tag.blockNum >= firstDelBlock[j])
+ {
+ InvalidateLocalBuffer(bufHdr, true);
+ break;
+ }
}
}
}
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index 0e8299dd556..a4ec7959f31 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -400,25 +400,22 @@ pg_fsync(int fd)
* portable, even if it runs ok on the current system.
*
* We assert here that a descriptor for a file was opened with write
- * permissions (either O_RDWR or O_WRONLY) and for a directory without
- * write permissions (O_RDONLY).
+ * permissions (i.e., not O_RDONLY) and for a directory without write
+ * permissions (O_RDONLY). Notice that the assertion check is made even
+ * if fsync() is disabled.
*
- * Ignore any fstat errors and let the follow-up fsync() do its work.
- * Doing this sanity check here counts for the case where fsync() is
- * disabled.
+ * If fstat() fails, ignore it and let the follow-up fsync() complain.
*/
if (fstat(fd, &st) == 0)
{
int desc_flags = fcntl(fd, F_GETFL);
- /*
- * O_RDONLY is historically 0, so just make sure that for directories
- * no write flags are used.
- */
+ desc_flags &= O_ACCMODE;
+
if (S_ISDIR(st.st_mode))
- Assert((desc_flags & (O_RDWR | O_WRONLY)) == 0);
+ Assert(desc_flags == O_RDONLY);
else
- Assert((desc_flags & (O_RDWR | O_WRONLY)) != 0);
+ Assert(desc_flags != O_RDONLY);
}
errno = 0;
#endif
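
To make the assertion change above concrete: masking the fcntl(F_GETFL) result with O_ACCMODE isolates the access mode, which can then be compared against O_RDONLY directly; since O_RDONLY is historically 0, testing individual write bits (as the old code did) could not express the directory case as an equality. A simplified standalone rendition, not the backend code itself:

#include <assert.h>
#include <fcntl.h>
#include <sys/stat.h>

static void
assert_sane_fsync_target(int fd)
{
	struct stat st;

	if (fstat(fd, &st) == 0)
	{
		int			desc_flags = fcntl(fd, F_GETFL);

		/* keep only the access mode: O_RDONLY, O_WRONLY or O_RDWR */
		desc_flags &= O_ACCMODE;

		if (S_ISDIR(st.st_mode))
			assert(desc_flags == O_RDONLY); /* directories: read-only */
		else
			assert(desc_flags != O_RDONLY); /* files: opened writable */
	}
}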
diff --git a/src/backend/storage/ipc/dsm_registry.c b/src/backend/storage/ipc/dsm_registry.c
index 1d4fd31ffed..828c2ff0c7f 100644
--- a/src/backend/storage/ipc/dsm_registry.c
+++ b/src/backend/storage/ipc/dsm_registry.c
@@ -15,6 +15,20 @@
* current backend. This function guarantees that only one backend
* initializes the segment and that all other backends just attach it.
*
+ * A DSA can be created in or retrieved from the registry by calling
+ * GetNamedDSA(). As with GetNamedDSMSegment(), if a DSA with the provided
+ * name does not yet exist, it is created. Otherwise, GetNamedDSA()
+ * ensures the DSA is attached to the current backend. This function
+ * guarantees that only one backend initializes the DSA and that all other
+ * backends just attach it.
+ *
+ * A dshash table can be created in or retrieved from the registry by
+ * calling GetNamedDSHash(). As with GetNamedDSMSegment(), if a hash
+ * table with the provided name does not yet exist, it is created.
+ * Otherwise, GetNamedDSHash() ensures the hash table is attached to the
+ * current backend. This function guarantees that only one backend
+ * initializes the table and that all other backends just attach it.
+ *
* Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -32,6 +46,12 @@
#include "storage/shmem.h"
#include "utils/memutils.h"
+#define DSMR_NAME_LEN 128
+
+#define DSMR_DSA_TRANCHE_SUFFIX " DSA"
+#define DSMR_DSA_TRANCHE_SUFFIX_LEN (sizeof(DSMR_DSA_TRANCHE_SUFFIX) - 1)
+#define DSMR_DSA_TRANCHE_NAME_LEN (DSMR_NAME_LEN + DSMR_DSA_TRANCHE_SUFFIX_LEN)
+
typedef struct DSMRegistryCtxStruct
{
dsa_handle dsah;
@@ -40,15 +60,48 @@ typedef struct DSMRegistryCtxStruct
static DSMRegistryCtxStruct *DSMRegistryCtx;
-typedef struct DSMRegistryEntry
+typedef struct NamedDSMState
{
- char name[64];
dsm_handle handle;
size_t size;
+} NamedDSMState;
+
+typedef struct NamedDSAState
+{
+ dsa_handle handle;
+ int tranche;
+ char tranche_name[DSMR_DSA_TRANCHE_NAME_LEN];
+} NamedDSAState;
+
+typedef struct NamedDSHState
+{
+ NamedDSAState dsa;
+ dshash_table_handle handle;
+ int tranche;
+ char tranche_name[DSMR_NAME_LEN];
+} NamedDSHState;
+
+typedef enum DSMREntryType
+{
+ DSMR_ENTRY_TYPE_DSM,
+ DSMR_ENTRY_TYPE_DSA,
+ DSMR_ENTRY_TYPE_DSH,
+} DSMREntryType;
+
+typedef struct DSMRegistryEntry
+{
+ char name[DSMR_NAME_LEN];
+ DSMREntryType type;
+ union
+ {
+ NamedDSMState dsm;
+ NamedDSAState dsa;
+ NamedDSHState dsh;
+ } data;
} DSMRegistryEntry;
static const dshash_parameters dsh_params = {
- offsetof(DSMRegistryEntry, handle),
+ offsetof(DSMRegistryEntry, type),
sizeof(DSMRegistryEntry),
dshash_strcmp,
dshash_strhash,
@@ -141,7 +194,7 @@ GetNamedDSMSegment(const char *name, size_t size,
ereport(ERROR,
(errmsg("DSM segment name cannot be empty")));
- if (strlen(name) >= offsetof(DSMRegistryEntry, handle))
+ if (strlen(name) >= offsetof(DSMRegistryEntry, type))
ereport(ERROR,
(errmsg("DSM segment name too long")));
@@ -158,32 +211,39 @@ GetNamedDSMSegment(const char *name, size_t size,
entry = dshash_find_or_insert(dsm_registry_table, name, found);
if (!(*found))
{
+ NamedDSMState *state = &entry->data.dsm;
+ dsm_segment *seg;
+
+ entry->type = DSMR_ENTRY_TYPE_DSM;
+
/* Initialize the segment. */
- dsm_segment *seg = dsm_create(size, 0);
+ seg = dsm_create(size, 0);
dsm_pin_segment(seg);
dsm_pin_mapping(seg);
- entry->handle = dsm_segment_handle(seg);
- entry->size = size;
+ state->handle = dsm_segment_handle(seg);
+ state->size = size;
ret = dsm_segment_address(seg);
if (init_callback)
(*init_callback) (ret);
}
- else if (entry->size != size)
- {
+ else if (entry->type != DSMR_ENTRY_TYPE_DSM)
ereport(ERROR,
- (errmsg("requested DSM segment size does not match size of "
- "existing segment")));
- }
+ (errmsg("requested DSM segment does not match type of existing entry")));
+ else if (entry->data.dsm.size != size)
+ ereport(ERROR,
+ (errmsg("requested DSM segment size does not match size of existing segment")));
else
{
- dsm_segment *seg = dsm_find_mapping(entry->handle);
+ NamedDSMState *state = &entry->data.dsm;
+ dsm_segment *seg;
/* If the existing segment is not already attached, attach it now. */
+ seg = dsm_find_mapping(state->handle);
if (seg == NULL)
{
- seg = dsm_attach(entry->handle);
+ seg = dsm_attach(state->handle);
if (seg == NULL)
elog(ERROR, "could not map dynamic shared memory segment");
@@ -198,3 +258,180 @@ GetNamedDSMSegment(const char *name, size_t size,
return ret;
}
+
+/*
+ * Initialize or attach a named DSA.
+ *
+ * This routine returns a pointer to the DSA. A new LWLock tranche ID will be
+ * generated if needed. Note that the lock tranche will be registered with the
+ * provided name. Also note that this should be called at most once for a
+ * given DSA in each backend.
+ */
+dsa_area *
+GetNamedDSA(const char *name, bool *found)
+{
+ DSMRegistryEntry *entry;
+ MemoryContext oldcontext;
+ dsa_area *ret;
+
+ Assert(found);
+
+ if (!name || *name == '\0')
+ ereport(ERROR,
+ (errmsg("DSA name cannot be empty")));
+
+ if (strlen(name) >= offsetof(DSMRegistryEntry, type))
+ ereport(ERROR,
+ (errmsg("DSA name too long")));
+
+ /* Be sure any local memory allocated by DSM/DSA routines is persistent. */
+ oldcontext = MemoryContextSwitchTo(TopMemoryContext);
+
+ /* Connect to the registry. */
+ init_dsm_registry();
+
+ entry = dshash_find_or_insert(dsm_registry_table, name, found);
+ if (!(*found))
+ {
+ NamedDSAState *state = &entry->data.dsa;
+
+ entry->type = DSMR_ENTRY_TYPE_DSA;
+
+ /* Initialize the LWLock tranche for the DSA. */
+ state->tranche = LWLockNewTrancheId();
+ strcpy(state->tranche_name, name);
+ LWLockRegisterTranche(state->tranche, state->tranche_name);
+
+ /* Initialize the DSA. */
+ ret = dsa_create(state->tranche);
+ dsa_pin(ret);
+ dsa_pin_mapping(ret);
+
+ /* Store handle for other backends to use. */
+ state->handle = dsa_get_handle(ret);
+ }
+ else if (entry->type != DSMR_ENTRY_TYPE_DSA)
+ ereport(ERROR,
+ (errmsg("requested DSA does not match type of existing entry")));
+ else
+ {
+ NamedDSAState *state = &entry->data.dsa;
+
+ if (dsa_is_attached(state->handle))
+ ereport(ERROR,
+ (errmsg("requested DSA already attached to current process")));
+
+ /* Initialize existing LWLock tranche for the DSA. */
+ LWLockRegisterTranche(state->tranche, state->tranche_name);
+
+ /* Attach to existing DSA. */
+ ret = dsa_attach(state->handle);
+ dsa_pin_mapping(ret);
+ }
+
+ dshash_release_lock(dsm_registry_table, entry);
+ MemoryContextSwitchTo(oldcontext);
+
+ return ret;
+}
+
+/*
+ * Initialize or attach a named dshash table.
+ *
+ * This routine returns the address of the table. The tranche_id member of
+ * params is ignored; new tranche IDs will be generated if needed. Note that
+ * the DSA lock tranche will be registered with the provided name with " DSA"
+ * appended. The dshash lock tranche will be registered with the provided
+ * name. Also note that this should be called at most once for a given table
+ * in each backend.
+ */
+dshash_table *
+GetNamedDSHash(const char *name, const dshash_parameters *params, bool *found)
+{
+ DSMRegistryEntry *entry;
+ MemoryContext oldcontext;
+ dshash_table *ret;
+
+ Assert(params);
+ Assert(found);
+
+ if (!name || *name == '\0')
+ ereport(ERROR,
+ (errmsg("DSHash name cannot be empty")));
+
+ if (strlen(name) >= offsetof(DSMRegistryEntry, type))
+ ereport(ERROR,
+ (errmsg("DSHash name too long")));
+
+ /* Be sure any local memory allocated by DSM/DSA routines is persistent. */
+ oldcontext = MemoryContextSwitchTo(TopMemoryContext);
+
+ /* Connect to the registry. */
+ init_dsm_registry();
+
+ entry = dshash_find_or_insert(dsm_registry_table, name, found);
+ if (!(*found))
+ {
+ NamedDSAState *dsa_state = &entry->data.dsh.dsa;
+ NamedDSHState *dsh_state = &entry->data.dsh;
+ dshash_parameters params_copy;
+ dsa_area *dsa;
+
+ entry->type = DSMR_ENTRY_TYPE_DSH;
+
+ /* Initialize the LWLock tranche for the DSA. */
+ dsa_state->tranche = LWLockNewTrancheId();
+ sprintf(dsa_state->tranche_name, "%s%s", name, DSMR_DSA_TRANCHE_SUFFIX);
+ LWLockRegisterTranche(dsa_state->tranche, dsa_state->tranche_name);
+
+ /* Initialize the LWLock tranche for the dshash table. */
+ dsh_state->tranche = LWLockNewTrancheId();
+ strcpy(dsh_state->tranche_name, name);
+ LWLockRegisterTranche(dsh_state->tranche, dsh_state->tranche_name);
+
+ /* Initialize the DSA for the hash table. */
+ dsa = dsa_create(dsa_state->tranche);
+ dsa_pin(dsa);
+ dsa_pin_mapping(dsa);
+
+ /* Initialize the dshash table. */
+ memcpy(&params_copy, params, sizeof(dshash_parameters));
+ params_copy.tranche_id = dsh_state->tranche;
+ ret = dshash_create(dsa, &params_copy, NULL);
+
+ /* Store handles for other backends to use. */
+ dsa_state->handle = dsa_get_handle(dsa);
+ dsh_state->handle = dshash_get_hash_table_handle(ret);
+ }
+ else if (entry->type != DSMR_ENTRY_TYPE_DSH)
+ ereport(ERROR,
+ (errmsg("requested DSHash does not match type of existing entry")));
+ else
+ {
+ NamedDSAState *dsa_state = &entry->data.dsh.dsa;
+ NamedDSHState *dsh_state = &entry->data.dsh;
+ dsa_area *dsa;
+
+ /* XXX: Should we verify params matches what table was created with? */
+
+ if (dsa_is_attached(dsa_state->handle))
+ ereport(ERROR,
+ (errmsg("requested DSHash already attached to current process")));
+
+ /* Initialize existing LWLock tranches for the DSA and dshash table. */
+ LWLockRegisterTranche(dsa_state->tranche, dsa_state->tranche_name);
+ LWLockRegisterTranche(dsh_state->tranche, dsh_state->tranche_name);
+
+ /* Attach to existing DSA for the hash table. */
+ dsa = dsa_attach(dsa_state->handle);
+ dsa_pin_mapping(dsa);
+
+ /* Attach to existing dshash table. */
+ ret = dshash_attach(dsa, params, dsh_state->handle, NULL);
+ }
+
+ dshash_release_lock(dsm_registry_table, entry);
+ MemoryContextSwitchTo(oldcontext);
+
+ return ret;
+}
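
For extension authors, the new interfaces are used much like GetNamedDSMSegment(). The following is a hedged usage sketch; my_extension_area, my_extension_hash, my_hash_params and attach_my_shared_state are hypothetical extension-side names, and the dshash_parameters layout assumed here is the current one with a copy_function member:

#include "postgres.h"

#include "lib/dshash.h"
#include "storage/dsm_registry.h"

static dsa_area *my_area;
static dshash_table *my_hash;

static const dshash_parameters my_hash_params = {
	sizeof(int),				/* key size */
	sizeof(int) * 2,			/* full entry size */
	dshash_memcmp,
	dshash_memhash,
	dshash_memcpy,
	0							/* tranche_id: ignored by GetNamedDSHash() */
};

static void
attach_my_shared_state(void)
{
	bool		found;

	/* The first caller creates the DSA and table; later callers attach. */
	my_area = GetNamedDSA("my_extension_area", &found);
	my_hash = GetNamedDSHash("my_extension_hash", &my_hash_params, &found);
}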
diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c
index 00c76d05356..2fa045e6b0f 100644
--- a/src/backend/storage/ipc/ipci.c
+++ b/src/backend/storage/ipc/ipci.c
@@ -51,7 +51,6 @@
#include "storage/sinvaladt.h"
#include "utils/guc.h"
#include "utils/injection_point.h"
-#include "utils/memutils.h"
/* GUCs */
int shared_memory_type = DEFAULT_SHARED_MEMORY_TYPE;
@@ -151,7 +150,6 @@ CalculateShmemSize(int *num_semaphores)
size = add_size(size, InjectionPointShmemSize());
size = add_size(size, SlotSyncShmemSize());
size = add_size(size, AioShmemSize());
- size = add_size(size, MemoryContextReportingShmemSize());
/* include additional requested shmem from preload libraries */
size = add_size(size, total_addin_request);
@@ -345,7 +343,6 @@ CreateOrAttachShmemStructs(void)
WaitEventCustomShmemInit();
InjectionPointShmemInit();
AioShmemInit();
- MemoryContextReportingShmemInit();
}
/*
diff --git a/src/backend/storage/ipc/procsignal.c b/src/backend/storage/ipc/procsignal.c
index ce69e26d720..a9bb540b55a 100644
--- a/src/backend/storage/ipc/procsignal.c
+++ b/src/backend/storage/ipc/procsignal.c
@@ -691,9 +691,6 @@ procsignal_sigusr1_handler(SIGNAL_ARGS)
if (CheckProcSignal(PROCSIG_LOG_MEMORY_CONTEXT))
HandleLogMemoryContextInterrupt();
- if (CheckProcSignal(PROCSIG_GET_MEMORY_CONTEXT))
- HandleGetMemoryContextInterrupt();
-
if (CheckProcSignal(PROCSIG_PARALLEL_APPLY_MESSAGE))
HandleParallelApplyMessageInterrupt();
diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c
index c9ae3b45b76..ca3656fc76f 100644
--- a/src/backend/storage/ipc/shmem.c
+++ b/src/backend/storage/ipc/shmem.c
@@ -679,12 +679,10 @@ pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS)
*/
for (i = 0; i < shm_ent_page_count; i++)
{
- volatile uint64 touch pg_attribute_unused();
-
page_ptrs[i] = startptr + (i * os_page_size);
if (firstNumaTouch)
- pg_numa_touch_mem_if_required(touch, page_ptrs[i]);
+ pg_numa_touch_mem_if_required(page_ptrs[i]);
CHECK_FOR_INTERRUPTS();
}
diff --git a/src/backend/storage/lmgr/lmgr.c b/src/backend/storage/lmgr/lmgr.c
index f50962983c3..3f6bf70bd3c 100644
--- a/src/backend/storage/lmgr/lmgr.c
+++ b/src/backend/storage/lmgr/lmgr.c
@@ -717,7 +717,10 @@ XactLockTableWait(TransactionId xid, Relation rel, ItemPointer ctid,
* through, to avoid slowing down the normal case.)
*/
if (!first)
+ {
+ CHECK_FOR_INTERRUPTS();
pg_usleep(1000L);
+ }
first = false;
xid = SubTransGetTopmostTransaction(xid);
}
@@ -757,7 +760,10 @@ ConditionalXactLockTableWait(TransactionId xid, bool logLockFailure)
/* See XactLockTableWait about this case */
if (!first)
+ {
+ CHECK_FOR_INTERRUPTS();
pg_usleep(1000L);
+ }
first = false;
xid = SubTransGetTopmostTransaction(xid);
}
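
The lmgr.c hunks follow a general rule worth stating: a retry loop that sleeps must poll for interrupts before each sleep, or a blocked backend cannot be cancelled promptly. A sketch of the pattern, where work_is_done() is a hypothetical loop-exit condition:

for (;;)
{
	if (work_is_done())			/* hypothetical condition */
		break;

	/* allow query cancel/termination between sleeps */
	CHECK_FOR_INTERRUPTS();
	pg_usleep(1000L);			/* 1ms, as in XactLockTableWait() */
}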
diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c
index 86b06b9223f..2776ceb295b 100644
--- a/src/backend/storage/lmgr/lock.c
+++ b/src/backend/storage/lmgr/lock.c
@@ -51,7 +51,7 @@
/* GUC variables */
int max_locks_per_xact; /* used to set the lock table size */
-bool log_lock_failure = false;
+bool log_lock_failures = false;
#define NLOCKENTS() \
mul_size(max_locks_per_xact, add_size(MaxBackends, max_prepared_xacts))
diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
index 5148ef982e3..46f44bc4511 100644
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -178,8 +178,6 @@ static const char *const BuiltinTrancheNames[] = {
[LWTRANCHE_XACT_SLRU] = "XactSLRU",
[LWTRANCHE_PARALLEL_VACUUM_DSA] = "ParallelVacuumDSA",
[LWTRANCHE_AIO_URING_COMPLETION] = "AioUringCompletion",
- [LWTRANCHE_MEMORY_CONTEXT_REPORTING_STATE] = "MemoryContextReportingState",
- [LWTRANCHE_MEMORY_CONTEXT_REPORTING_PROC] = "MemoryContextReportingPerProcess",
};
StaticAssertDecl(lengthof(BuiltinTrancheNames) ==
diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c
index f194e6b3dcc..e9ef0fbfe32 100644
--- a/src/backend/storage/lmgr/proc.c
+++ b/src/backend/storage/lmgr/proc.c
@@ -50,7 +50,6 @@
#include "storage/procsignal.h"
#include "storage/spin.h"
#include "storage/standby.h"
-#include "utils/memutils.h"
#include "utils/timeout.h"
#include "utils/timestamp.h"