Diffstat (limited to 'src/backend/storage/aio')
-rw-r--r--  src/backend/storage/aio/aio.c              140
-rw-r--r--  src/backend/storage/aio/aio_callback.c       7
-rw-r--r--  src/backend/storage/aio/aio_io.c             4
-rw-r--r--  src/backend/storage/aio/method_io_uring.c    8
-rw-r--r--  src/backend/storage/aio/method_worker.c      7
5 files changed, 140 insertions(+), 26 deletions(-)
diff --git a/src/backend/storage/aio/aio.c b/src/backend/storage/aio/aio.c
index ebb5a771bfd..3643f27ad6e 100644
--- a/src/backend/storage/aio/aio.c
+++ b/src/backend/storage/aio/aio.c
@@ -184,6 +184,8 @@ pgaio_io_acquire(struct ResourceOwnerData *resowner, PgAioReturn *ret)
PgAioHandle *
pgaio_io_acquire_nb(struct ResourceOwnerData *resowner, PgAioReturn *ret)
{
+ PgAioHandle *ioh = NULL;
+
if (pgaio_my_backend->num_staged_ios >= PGAIO_SUBMIT_BATCH_SIZE)
{
Assert(pgaio_my_backend->num_staged_ios == PGAIO_SUBMIT_BATCH_SIZE);
@@ -193,10 +195,17 @@ pgaio_io_acquire_nb(struct ResourceOwnerData *resowner, PgAioReturn *ret)
if (pgaio_my_backend->handed_out_io)
elog(ERROR, "API violation: Only one IO can be handed out");
+ /*
+ * Probably not needed today, as interrupts should not process this IO,
+ * but...
+ */
+ HOLD_INTERRUPTS();
+
if (!dclist_is_empty(&pgaio_my_backend->idle_ios))
{
dlist_node *ion = dclist_pop_head_node(&pgaio_my_backend->idle_ios);
- PgAioHandle *ioh = dclist_container(PgAioHandle, node, ion);
+
+ ioh = dclist_container(PgAioHandle, node, ion);
Assert(ioh->state == PGAIO_HS_IDLE);
Assert(ioh->owner_procno == MyProcNumber);
@@ -212,11 +221,11 @@ pgaio_io_acquire_nb(struct ResourceOwnerData *resowner, PgAioReturn *ret)
ioh->report_return = ret;
ret->result.status = PGAIO_RS_UNKNOWN;
}
-
- return ioh;
}
- return NULL;
+ RESUME_INTERRUPTS();
+
+ return ioh;
}
/*
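[Annotation] The HOLD_INTERRUPTS()/RESUME_INTERRUPTS() bracketing introduced above keeps interrupt processing away from a handle while it is in an intermediate state. As a rough standalone sketch of the mechanism (not PostgreSQL's actual code; in the backend these macros adjust InterruptHoldoffCount, which CHECK_FOR_INTERRUPTS() respects), it boils down to a holdoff counter plus an assertion that can sit at a chokepoint such as pgaio_io_update_state():

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

static volatile unsigned holdoff_count = 0;	/* analogue of InterruptHoldoffCount */
static volatile bool interrupt_pending = false;

#define HOLD_INTR()	(holdoff_count++)
#define RESUME_INTR()	(assert(holdoff_count > 0), holdoff_count--)

/* analogue of CHECK_FOR_INTERRUPTS(): only acts when nothing holds interrupts off */
static void
check_for_interrupts(void)
{
	if (interrupt_pending && holdoff_count == 0)
	{
		interrupt_pending = false;
		printf("deferred interrupt processed\n");
	}
}

/* analogue of pgaio_io_update_state(): every caller must hold interrupts */
static void
update_state(int *state, int new_state)
{
	assert(holdoff_count > 0);
	*state = new_state;
}

int
main(void)
{
	int		state = 0;

	interrupt_pending = true;

	HOLD_INTR();
	update_state(&state, 1);	/* intermediate state invisible to interrupt processing */
	check_for_interrupts();		/* no-op: held off */
	RESUME_INTR();

	check_for_interrupts();		/* the deferred work runs now */
	return 0;
}

Placing the assertion in the single state-update helper is what lets the Assert(!INTERRUPTS_CAN_BE_PROCESSED()) added further down catch any call path that forgot to hold interrupts.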
@@ -233,6 +242,12 @@ pgaio_io_release(PgAioHandle *ioh)
Assert(ioh->resowner);
pgaio_my_backend->handed_out_io = NULL;
+
+ /*
+ * Note that no interrupts are processed between the handed_out_io
+ * check and the call to reclaim - that's important as otherwise an
+ * interrupt could have already reclaimed the handle.
+ */
pgaio_io_reclaim(ioh);
}
else
@@ -251,6 +266,12 @@ pgaio_io_release_resowner(dlist_node *ioh_node, bool on_error)
Assert(ioh->resowner);
+ /*
+ * Otherwise an interrupt, in the middle of releasing the IO, could end up
+ * trying to wait for the IO, leading to state confusion.
+ */
+ HOLD_INTERRUPTS();
+
ResourceOwnerForgetAioHandle(ioh->resowner, &ioh->resowner_node);
ioh->resowner = NULL;
@@ -291,6 +312,8 @@ pgaio_io_release_resowner(dlist_node *ioh_node, bool on_error)
*/
if (ioh->report_return)
ioh->report_return = NULL;
+
+ RESUME_INTERRUPTS();
}
/*
@@ -359,6 +382,13 @@ pgaio_io_get_wref(PgAioHandle *ioh, PgAioWaitRef *iow)
static inline void
pgaio_io_update_state(PgAioHandle *ioh, PgAioHandleState new_state)
{
+ /*
+ * All callers need to have held interrupts in some form, otherwise
+ * interrupt processing could wait for the IO to complete, while in an
+ * intermediary state.
+ */
+ Assert(!INTERRUPTS_CAN_BE_PROCESSED());
+
pgaio_debug_io(DEBUG5, ioh,
"updating state to %s",
pgaio_io_state_get_name(new_state));
@@ -396,6 +426,13 @@ pgaio_io_stage(PgAioHandle *ioh, PgAioOp op)
Assert(pgaio_my_backend->handed_out_io == ioh);
Assert(pgaio_io_has_target(ioh));
+ /*
+ * Otherwise an interrupt, in the middle of staging and possibly executing
+ * the IO, could end up trying to wait for the IO, leading to state
+ * confusion.
+ */
+ HOLD_INTERRUPTS();
+
ioh->op = op;
ioh->result = 0;
@@ -435,6 +472,8 @@ pgaio_io_stage(PgAioHandle *ioh, PgAioOp op)
pgaio_io_prepare_submit(ioh);
pgaio_io_perform_synchronously(ioh);
}
+
+ RESUME_INTERRUPTS();
}
bool
@@ -517,6 +556,13 @@ bool
pgaio_io_was_recycled(PgAioHandle *ioh, uint64 ref_generation, PgAioHandleState *state)
{
*state = ioh->state;
+
+ /*
+ * Ensure that we don't see an earlier state of the handle than ioh->state
+ * due to compiler or CPU reordering. This protects both ->generation as
+ * directly used here, and other fields in the handle accessed in the
+ * caller if the handle was not reused.
+ */
pg_read_barrier();
return ioh->generation != ref_generation;
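[Annotation] For readers unfamiliar with pg_read_barrier(), the ordering requirement documented above can be restated in portable C11 atomics. This is an illustrative analogue only; PostgreSQL uses its own barrier primitives rather than <stdatomic.h>:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

typedef struct
{
	_Atomic int			state;
	_Atomic uint64_t	generation;
} handle_t;

/*
 * Returns true if the handle was recycled (its generation no longer matches
 * the reference generation the caller obtained earlier).  *state is only
 * trustworthy when false is returned.
 */
bool
was_recycled(handle_t *h, uint64_t ref_generation, int *state)
{
	*state = atomic_load_explicit(&h->state, memory_order_relaxed);

	/*
	 * Read barrier: the state load above must complete before the generation
	 * load below, otherwise a stale state could be paired with a current
	 * generation (or newer fields could be read before the state).
	 */
	atomic_thread_fence(memory_order_acquire);

	return atomic_load_explicit(&h->generation, memory_order_relaxed)
		!= ref_generation;
}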
@@ -544,8 +590,8 @@ pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation)
&& state != PGAIO_HS_COMPLETED_SHARED
&& state != PGAIO_HS_COMPLETED_LOCAL)
{
- elog(PANIC, "waiting for own IO in wrong state: %d",
- state);
+ elog(PANIC, "waiting for own IO %d in wrong state: %s",
+ pgaio_io_get_id(ioh), pgaio_io_get_state_name(ioh));
}
}
@@ -599,7 +645,13 @@ pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation)
case PGAIO_HS_COMPLETED_SHARED:
case PGAIO_HS_COMPLETED_LOCAL:
- /* see above */
+
+ /*
+ * Note that no interrupts are processed between
+ * pgaio_io_was_recycled() and this check - that's important
+ * as otherwise an interrupt could have already reclaimed the
+ * handle.
+ */
if (am_owner)
pgaio_io_reclaim(ioh);
return;
@@ -610,6 +662,11 @@ pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation)
/*
* Make IO handle ready to be reused after IO has completed or after the
* handle has been released without being used.
+ *
+ * Note that callers need to be careful about only calling this in the right
+ * state and that no interrupts can be processed between the state check and
+ * the call to pgaio_io_reclaim(). Otherwise interrupt processing could
+ * already have reclaimed the handle.
*/
static void
pgaio_io_reclaim(PgAioHandle *ioh)
@@ -618,6 +675,9 @@ pgaio_io_reclaim(PgAioHandle *ioh)
Assert(ioh->owner_procno == MyProcNumber);
Assert(ioh->state != PGAIO_HS_IDLE);
+ /* see comment in function header */
+ HOLD_INTERRUPTS();
+
/*
* It's a bit ugly, but right now the easiest place to put the execution
* of local completion callbacks is this function, as we need to execute
@@ -685,6 +745,8 @@ pgaio_io_reclaim(PgAioHandle *ioh)
* efficient in cases where only a few IOs are used.
*/
dclist_push_head(&pgaio_my_backend->idle_ios, &ioh->node);
+
+ RESUME_INTERRUPTS();
}
/*
@@ -697,10 +759,10 @@ pgaio_io_wait_for_free(void)
{
int reclaimed = 0;
- pgaio_debug(DEBUG2, "waiting for free IO with %d pending, %d in-flight, %d idle IOs",
+ pgaio_debug(DEBUG2, "waiting for free IO with %d pending, %u in-flight, %u idle IOs",
pgaio_my_backend->num_staged_ios,
dclist_count(&pgaio_my_backend->in_flight_ios),
- dclist_is_empty(&pgaio_my_backend->idle_ios));
+ dclist_count(&pgaio_my_backend->idle_ios));
/*
* First check if any of our IOs actually have completed - when using
@@ -714,6 +776,16 @@ pgaio_io_wait_for_free(void)
if (ioh->state == PGAIO_HS_COMPLETED_SHARED)
{
+ /*
+ * Note that no interrupts are processed between the state check
+ * and the call to reclaim - that's important as otherwise an
+ * interrupt could have already reclaimed the handle.
+ *
+ * Need to ensure that there's no reordering; in the more common
+ * paths, where we wait for IO, that's done by
+ * pgaio_io_was_recycled().
+ */
+ pg_read_barrier();
pgaio_io_reclaim(ioh);
reclaimed++;
}
@@ -730,13 +802,17 @@ pgaio_io_wait_for_free(void)
if (pgaio_my_backend->num_staged_ios > 0)
pgaio_submit_staged();
+ /* possibly some IOs finished during submission */
+ if (!dclist_is_empty(&pgaio_my_backend->idle_ios))
+ return;
+
if (dclist_count(&pgaio_my_backend->in_flight_ios) == 0)
ereport(ERROR,
errmsg_internal("no free IOs despite no in-flight IOs"),
- errdetail_internal("%d pending, %d in-flight, %d idle IOs",
+ errdetail_internal("%d pending, %u in-flight, %u idle IOs",
pgaio_my_backend->num_staged_ios,
dclist_count(&pgaio_my_backend->in_flight_ios),
- dclist_is_empty(&pgaio_my_backend->idle_ios)));
+ dclist_count(&pgaio_my_backend->idle_ios)));
/*
* Wait for the oldest in-flight IO to complete.
@@ -747,6 +823,7 @@ pgaio_io_wait_for_free(void)
{
PgAioHandle *ioh = dclist_head_element(PgAioHandle, node,
&pgaio_my_backend->in_flight_ios);
+ uint64 generation = ioh->generation;
switch (ioh->state)
{
@@ -763,20 +840,36 @@ pgaio_io_wait_for_free(void)
case PGAIO_HS_COMPLETED_IO:
case PGAIO_HS_SUBMITTED:
pgaio_debug_io(DEBUG2, ioh,
- "waiting for free io with %d in flight",
+ "waiting for free io with %u in flight",
dclist_count(&pgaio_my_backend->in_flight_ios));
/*
* In a more general case this would be racy, because the
* generation could increase after we read ioh->state above.
* But we are only looking at IOs by the current backend and
- * the IO can only be recycled by this backend.
+ * the IO can only be recycled by this backend. Even this is
+ * only OK because we get the handle's generation before
+ * potentially processing interrupts, e.g. as part of
+ * pgaio_debug_io().
*/
- pgaio_io_wait(ioh, ioh->generation);
+ pgaio_io_wait(ioh, generation);
break;
case PGAIO_HS_COMPLETED_SHARED:
- /* it's possible that another backend just finished this IO */
+
+ /*
+ * It's possible that another backend just finished this IO.
+ *
+ * Note that no interrupts are processed between the state
+ * check and the call to reclaim - that's important as
+ * otherwise an interrupt could have already reclaimed the
+ * handle.
+ *
+ * Need to ensure that there's no reordering; in the more
+ * common paths, where we wait for IO, that's done by
+ * pgaio_io_was_recycled().
+ */
+ pg_read_barrier();
pgaio_io_reclaim(ioh);
break;
}
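[Annotation] The local generation snapshot taken in this hunk (and repeated in pgaio_closing_fd() and pgaio_shutdown() below) exists because any call that may process interrupts, such as the elog machinery behind pgaio_debug_io(), can reclaim and recycle the very handle being examined. A condensed restatement of the idiom, with the reasoning inline:

	/* Snapshot the generation while no interrupt can yet have recycled the
	 * handle ... */
	uint64		generation = ioh->generation;

	/* ... because this call may process interrupts and thereby reclaim and
	 * reuse the handle, bumping ioh->generation. */
	pgaio_debug_io(DEBUG2, ioh, "waiting for free io ...");

	/* Waiting against the snapshot is safe: if the handle was recycled, the
	 * generation check inside pgaio_io_wait() detects that and returns
	 * instead of waiting on someone else's IO. */
	pgaio_io_wait(ioh, generation);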
@@ -926,6 +1019,11 @@ pgaio_wref_check_done(PgAioWaitRef *iow)
if (state == PGAIO_HS_COMPLETED_SHARED ||
state == PGAIO_HS_COMPLETED_LOCAL)
{
+ /*
+ * Note that no interrupts are processed between
+ * pgaio_io_was_recycled() and this check - that's important as
+ * otherwise an interrupt could have already reclaimed the handle.
+ */
if (am_owner)
pgaio_io_reclaim(ioh);
return true;
@@ -1153,11 +1251,14 @@ pgaio_closing_fd(int fd)
{
dlist_iter iter;
PgAioHandle *ioh = NULL;
+ uint64 generation;
dclist_foreach(iter, &pgaio_my_backend->in_flight_ios)
{
ioh = dclist_container(PgAioHandle, node, iter.cur);
+ generation = ioh->generation;
+
if (pgaio_io_uses_fd(ioh, fd))
break;
else
@@ -1168,11 +1269,11 @@ pgaio_closing_fd(int fd)
break;
pgaio_debug_io(DEBUG2, ioh,
- "waiting for IO before FD %d gets closed, %d in-flight IOs",
+ "waiting for IO before FD %d gets closed, %u in-flight IOs",
fd, dclist_count(&pgaio_my_backend->in_flight_ios));
/* see comment in pgaio_io_wait_for_free() about raciness */
- pgaio_io_wait(ioh, ioh->generation);
+ pgaio_io_wait(ioh, generation);
}
}
}
@@ -1201,13 +1302,14 @@ pgaio_shutdown(int code, Datum arg)
while (!dclist_is_empty(&pgaio_my_backend->in_flight_ios))
{
PgAioHandle *ioh = dclist_head_element(PgAioHandle, node, &pgaio_my_backend->in_flight_ios);
+ uint64 generation = ioh->generation;
pgaio_debug_io(DEBUG2, ioh,
- "waiting for IO to complete during shutdown, %d in-flight IOs",
+ "waiting for IO to complete during shutdown, %u in-flight IOs",
dclist_count(&pgaio_my_backend->in_flight_ios));
/* see comment in pgaio_io_wait_for_free() about raciness */
- pgaio_io_wait(ioh, ioh->generation);
+ pgaio_io_wait(ioh, generation);
}
pgaio_my_backend = NULL;
diff --git a/src/backend/storage/aio/aio_callback.c b/src/backend/storage/aio/aio_callback.c
index 0ad9795bb7e..03c9bba0802 100644
--- a/src/backend/storage/aio/aio_callback.c
+++ b/src/backend/storage/aio/aio_callback.c
@@ -256,6 +256,9 @@ pgaio_io_call_complete_shared(PgAioHandle *ioh)
pgaio_result_status_string(result.status),
result.id, result.error_data, result.result);
result = ce->cb->complete_shared(ioh, result, cb_data);
+
+ /* the callback should never transition to unknown */
+ Assert(result.status != PGAIO_RS_UNKNOWN);
}
ioh->distilled_result = result;
@@ -290,6 +293,7 @@ pgaio_io_call_complete_local(PgAioHandle *ioh)
/* start with distilled result from shared callback */
result = ioh->distilled_result;
+ Assert(result.status != PGAIO_RS_UNKNOWN);
for (int i = ioh->num_callbacks; i > 0; i--)
{
@@ -306,6 +310,9 @@ pgaio_io_call_complete_local(PgAioHandle *ioh)
pgaio_result_status_string(result.status),
result.id, result.error_data, result.result);
result = ce->cb->complete_local(ioh, result, cb_data);
+
+ /* the callback should never transition to unknown */
+ Assert(result.status != PGAIO_RS_UNKNOWN);
}
/*
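[Annotation] The new assertions pin down a contract of the completion-callback chain: the shared stage establishes a definite status, and each callback may refine the result but must never return PGAIO_RS_UNKNOWN. A minimal standalone sketch of that fold-over-callbacks contract (illustrative types and names, not the real PgAioResult machinery):

#include <assert.h>
#include <stddef.h>

typedef enum { RS_UNKNOWN, RS_OK, RS_PARTIAL, RS_ERROR } result_status;

typedef struct
{
	result_status status;
	int			error_data;
} result_t;

typedef result_t (*complete_cb) (result_t so_far);

/* example callback: may refine the result, but never resets it to unknown */
static result_t
cb_attach_error_details(result_t so_far)
{
	if (so_far.status == RS_ERROR)
		so_far.error_data = 42;
	return so_far;
}

static result_t
run_completion_chain(result_t initial, const complete_cb *cbs, size_t ncbs)
{
	result_t	result = initial;

	assert(result.status != RS_UNKNOWN);	/* the shared stage already decided */

	for (size_t i = 0; i < ncbs; i++)
	{
		result = cbs[i](result);
		/* a callback should never transition back to unknown */
		assert(result.status != RS_UNKNOWN);
	}
	return result;
}

int
main(void)
{
	const complete_cb cbs[] = {cb_attach_error_details};
	result_t	initial = {RS_OK, 0};

	(void) run_completion_chain(initial, cbs, 1);
	return 0;
}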
diff --git a/src/backend/storage/aio/aio_io.c b/src/backend/storage/aio/aio_io.c
index 00e176135a6..520b5077df2 100644
--- a/src/backend/storage/aio/aio_io.c
+++ b/src/backend/storage/aio/aio_io.c
@@ -181,9 +181,9 @@ pgaio_io_get_op_name(PgAioHandle *ioh)
case PGAIO_OP_INVALID:
return "invalid";
case PGAIO_OP_READV:
- return "read";
+ return "readv";
case PGAIO_OP_WRITEV:
- return "write";
+ return "writev";
}
return NULL; /* silence compiler */
diff --git a/src/backend/storage/aio/method_io_uring.c b/src/backend/storage/aio/method_io_uring.c
index c719ba2727a..b78048328e1 100644
--- a/src/backend/storage/aio/method_io_uring.c
+++ b/src/backend/storage/aio/method_io_uring.c
@@ -126,7 +126,7 @@ pgaio_uring_shmem_size(void)
static void
pgaio_uring_shmem_init(bool first_time)
{
- int TotalProcs = MaxBackends + NUM_AUXILIARY_PROCS - MAX_IO_WORKERS;
+ int TotalProcs = pgaio_uring_procs();
bool found;
pgaio_uring_contexts = (PgAioUringContext *)
@@ -400,9 +400,9 @@ pgaio_uring_wait_one(PgAioHandle *ioh, uint64 ref_generation)
while (true)
{
pgaio_debug_io(DEBUG3, ioh,
- "wait_one io_gen: %llu, ref_gen: %llu, cycle %d",
- (long long unsigned) ioh->generation,
- (long long unsigned) ref_generation,
+ "wait_one io_gen: %" PRIu64 ", ref_gen: %" PRIu64 ", cycle %d",
+ ioh->generation,
+ ref_generation,
waited);
if (pgaio_io_was_recycled(ioh, ref_generation, &state) ||
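[Annotation] The format-string change above replaces the cast-to-long-long-unsigned idiom with the C99 <inttypes.h> PRI macros, which expand to the correct conversion specifier for a 64-bit unsigned integer on each platform. A trivial standalone illustration:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t	generation = 42;

	/* no cast needed: PRIu64 expands to the right length modifier here */
	printf("io_gen: %" PRIu64 "\n", generation);
	return 0;
}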
diff --git a/src/backend/storage/aio/method_worker.c b/src/backend/storage/aio/method_worker.c
index 743cccc2acd..36be179678d 100644
--- a/src/backend/storage/aio/method_worker.c
+++ b/src/backend/storage/aio/method_worker.c
@@ -461,7 +461,12 @@ IoWorkerMain(const void *startup_data, size_t startup_data_len)
int nwakeups = 0;
int worker;
- /* Try to get a job to do. */
+ /*
+ * Try to get a job to do.
+ *
+ * The lwlock acquisition also provides the necessary memory barrier
+ * to ensure that we don't see outdated data in the handle.
+ */
LWLockAcquire(AioWorkerSubmissionQueueLock, LW_EXCLUSIVE);
if ((io_index = pgaio_worker_submission_queue_consume()) == UINT32_MAX)
{
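[Annotation] The comment added here relies on lock acquisition being a synchronization point: whatever the submitting backend wrote to the handle before enqueueing its index under the same lock is visible to the worker once it has acquired AioWorkerSubmissionQueueLock. The same guarantee can be sketched with a mutex-protected queue (illustrative names and layout, not the actual worker-mode implementation):

#include <pthread.h>
#include <stdint.h>

typedef struct
{
	pthread_mutex_t lock;
	int			queue[64];
	int			nqueued;
	uint64_t	payload[64];	/* data the consumer reads after dequeue */
} submission_queue;

/* producer: fill in the payload, then publish the index under the lock */
void
submit(submission_queue *q, int idx, uint64_t value)
{
	q->payload[idx] = value;		/* plain store ... */

	pthread_mutex_lock(&q->lock);	/* ... ordered before the enqueue */
	q->queue[q->nqueued++] = idx;
	pthread_mutex_unlock(&q->lock);
}

/* consumer: the lock acquisition is the memory barrier that makes the
 * producer's earlier payload store visible here */
int
consume(submission_queue *q, uint64_t *value)
{
	int			idx = -1;

	pthread_mutex_lock(&q->lock);
	if (q->nqueued > 0)
		idx = q->queue[--q->nqueued];
	pthread_mutex_unlock(&q->lock);

	if (idx >= 0)
		*value = q->payload[idx];	/* safe: happens after the submit */
	return idx;
}

Because the producer's unlock synchronizes with the consumer's later lock, no separate barrier is needed before reading the handle's contents after the dequeue.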