aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorAndres Freund <andres@anarazel.de>2025-05-19 21:07:06 -0400
committerAndres Freund <andres@anarazel.de>2025-05-19 21:07:06 -0400
commitacad909321a4742a78b83cf7e975e627e786af5c (patch)
tree3a71d28970cd7b75c15c66018c4d5aa3374c1463 /src
parent29f7ce6fe78e3f8d520764b5870453d791a3ca65 (diff)
downloadpostgresql-acad909321a4742a78b83cf7e975e627e786af5c.tar.gz
postgresql-acad909321a4742a78b83cf7e975e627e786af5c.zip
aio: Fix possible state confusions due to interrupt processingHEADmaster
elog()/ereport() process interrupts, iff the log message is < ERROR and the log message will be emitted. aio's debug messages are emitted via ereport(), but in some places the code is not ready for interrupts to be processed. Fix the issue using a few different methods: 1) handle interrupts arriving concurrently - in some places it's easy to detect that by fetching the handle's generation a bit earlier 2) Check if interrupts made the work needing to be done obsolete 3) Disallow interrupts, as there's no sane way to make interrupt processing safe To prevent some similar issues from being re-introduced, assert that interrupts are held in pgaio_io_update_state(). This commit also fixes the contents of a debug message I added in 039bfc457e4. Reported-by: Alexander Lakhin <exclusion@gmail.com> Reviewed-by: Noah Misch <noah@leadboat.com> Discussion: https://postgr.es/m/mvpm7ga3dfgz7bvum22hmuz26cariylmcppb3irayftc7bwk3r@l7gb6gr7azhc
Diffstat (limited to 'src')
-rw-r--r--src/backend/storage/aio/aio.c113
1 files changed, 99 insertions, 14 deletions
diff --git a/src/backend/storage/aio/aio.c b/src/backend/storage/aio/aio.c
index ebb5a771bfd..c64d815ebd1 100644
--- a/src/backend/storage/aio/aio.c
+++ b/src/backend/storage/aio/aio.c
@@ -184,6 +184,8 @@ pgaio_io_acquire(struct ResourceOwnerData *resowner, PgAioReturn *ret)
PgAioHandle *
pgaio_io_acquire_nb(struct ResourceOwnerData *resowner, PgAioReturn *ret)
{
+ PgAioHandle *ioh = NULL;
+
if (pgaio_my_backend->num_staged_ios >= PGAIO_SUBMIT_BATCH_SIZE)
{
Assert(pgaio_my_backend->num_staged_ios == PGAIO_SUBMIT_BATCH_SIZE);
@@ -193,10 +195,17 @@ pgaio_io_acquire_nb(struct ResourceOwnerData *resowner, PgAioReturn *ret)
if (pgaio_my_backend->handed_out_io)
elog(ERROR, "API violation: Only one IO can be handed out");
+ /*
+ * Probably not needed today, as interrupts should not process this IO,
+ * but...
+ */
+ HOLD_INTERRUPTS();
+
if (!dclist_is_empty(&pgaio_my_backend->idle_ios))
{
dlist_node *ion = dclist_pop_head_node(&pgaio_my_backend->idle_ios);
- PgAioHandle *ioh = dclist_container(PgAioHandle, node, ion);
+
+ ioh = dclist_container(PgAioHandle, node, ion);
Assert(ioh->state == PGAIO_HS_IDLE);
Assert(ioh->owner_procno == MyProcNumber);
@@ -212,11 +221,11 @@ pgaio_io_acquire_nb(struct ResourceOwnerData *resowner, PgAioReturn *ret)
ioh->report_return = ret;
ret->result.status = PGAIO_RS_UNKNOWN;
}
-
- return ioh;
}
- return NULL;
+ RESUME_INTERRUPTS();
+
+ return ioh;
}
/*
@@ -233,6 +242,12 @@ pgaio_io_release(PgAioHandle *ioh)
Assert(ioh->resowner);
pgaio_my_backend->handed_out_io = NULL;
+
+ /*
+ * Note that no interrupts are processed between the handed_out_io
+ * check and the call to reclaim - that's important as otherwise an
+ * interrupt could have already reclaimed the handle.
+ */
pgaio_io_reclaim(ioh);
}
else
@@ -251,6 +266,12 @@ pgaio_io_release_resowner(dlist_node *ioh_node, bool on_error)
Assert(ioh->resowner);
+ /*
+ * Otherwise an interrupt, in the middle of releasing the IO, could end up
+ * trying to wait for the IO, leading to state confusion.
+ */
+ HOLD_INTERRUPTS();
+
ResourceOwnerForgetAioHandle(ioh->resowner, &ioh->resowner_node);
ioh->resowner = NULL;
@@ -291,6 +312,8 @@ pgaio_io_release_resowner(dlist_node *ioh_node, bool on_error)
*/
if (ioh->report_return)
ioh->report_return = NULL;
+
+ RESUME_INTERRUPTS();
}
/*
@@ -359,6 +382,13 @@ pgaio_io_get_wref(PgAioHandle *ioh, PgAioWaitRef *iow)
static inline void
pgaio_io_update_state(PgAioHandle *ioh, PgAioHandleState new_state)
{
+ /*
+ * All callers need to have held interrupts in some form, otherwise
+ * interrupt processing could wait for the IO to complete, while in an
+ * intermediary state.
+ */
+ Assert(!INTERRUPTS_CAN_BE_PROCESSED());
+
pgaio_debug_io(DEBUG5, ioh,
"updating state to %s",
pgaio_io_state_get_name(new_state));
@@ -396,6 +426,13 @@ pgaio_io_stage(PgAioHandle *ioh, PgAioOp op)
Assert(pgaio_my_backend->handed_out_io == ioh);
Assert(pgaio_io_has_target(ioh));
+ /*
+ * Otherwise an interrupt, in the middle of staging and possibly executing
+ * the IO, could end up trying to wait for the IO, leading to state
+ * confusion.
+ */
+ HOLD_INTERRUPTS();
+
ioh->op = op;
ioh->result = 0;
@@ -435,6 +472,8 @@ pgaio_io_stage(PgAioHandle *ioh, PgAioOp op)
pgaio_io_prepare_submit(ioh);
pgaio_io_perform_synchronously(ioh);
}
+
+ RESUME_INTERRUPTS();
}
bool
@@ -544,8 +583,8 @@ pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation)
&& state != PGAIO_HS_COMPLETED_SHARED
&& state != PGAIO_HS_COMPLETED_LOCAL)
{
- elog(PANIC, "waiting for own IO in wrong state: %d",
- state);
+ elog(PANIC, "waiting for own IO %d in wrong state: %s",
+ pgaio_io_get_id(ioh), pgaio_io_get_state_name(ioh));
}
}
@@ -599,7 +638,13 @@ pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation)
case PGAIO_HS_COMPLETED_SHARED:
case PGAIO_HS_COMPLETED_LOCAL:
- /* see above */
+
+ /*
+ * Note that no interrupts are processed between
+ * pgaio_io_was_recycled() and this check - that's important
+ * as otherwise an interrupt could have already reclaimed the
+ * handle.
+ */
if (am_owner)
pgaio_io_reclaim(ioh);
return;
@@ -610,6 +655,11 @@ pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation)
/*
* Make IO handle ready to be reused after IO has completed or after the
* handle has been released without being used.
+ *
+ * Note that callers need to be careful about only calling this in the right
+ * state and that no interrupts can be processed between the state check and
+ * the call to pgaio_io_reclaim(). Otherwise interrupt processing could
+ * already have reclaimed the handle.
*/
static void
pgaio_io_reclaim(PgAioHandle *ioh)
@@ -618,6 +668,9 @@ pgaio_io_reclaim(PgAioHandle *ioh)
Assert(ioh->owner_procno == MyProcNumber);
Assert(ioh->state != PGAIO_HS_IDLE);
+ /* see comment in function header */
+ HOLD_INTERRUPTS();
+
/*
* It's a bit ugly, but right now the easiest place to put the execution
* of local completion callbacks is this function, as we need to execute
@@ -685,6 +738,8 @@ pgaio_io_reclaim(PgAioHandle *ioh)
* efficient in cases where only a few IOs are used.
*/
dclist_push_head(&pgaio_my_backend->idle_ios, &ioh->node);
+
+ RESUME_INTERRUPTS();
}
/*
@@ -700,7 +755,7 @@ pgaio_io_wait_for_free(void)
pgaio_debug(DEBUG2, "waiting for free IO with %d pending, %d in-flight, %d idle IOs",
pgaio_my_backend->num_staged_ios,
dclist_count(&pgaio_my_backend->in_flight_ios),
- dclist_is_empty(&pgaio_my_backend->idle_ios));
+ dclist_count(&pgaio_my_backend->idle_ios));
/*
* First check if any of our IOs actually have completed - when using
@@ -714,6 +769,11 @@ pgaio_io_wait_for_free(void)
if (ioh->state == PGAIO_HS_COMPLETED_SHARED)
{
+ /*
+ * Note that no interrupts are processed between the state check
+ * and the call to reclaim - that's important as otherwise an
+ * interrupt could have already reclaimed the handle.
+ */
pgaio_io_reclaim(ioh);
reclaimed++;
}
@@ -730,13 +790,17 @@ pgaio_io_wait_for_free(void)
if (pgaio_my_backend->num_staged_ios > 0)
pgaio_submit_staged();
+ /* possibly some IOs finished during submission */
+ if (!dclist_is_empty(&pgaio_my_backend->idle_ios))
+ return;
+
if (dclist_count(&pgaio_my_backend->in_flight_ios) == 0)
ereport(ERROR,
errmsg_internal("no free IOs despite no in-flight IOs"),
errdetail_internal("%d pending, %d in-flight, %d idle IOs",
pgaio_my_backend->num_staged_ios,
dclist_count(&pgaio_my_backend->in_flight_ios),
- dclist_is_empty(&pgaio_my_backend->idle_ios)));
+ dclist_count(&pgaio_my_backend->idle_ios)));
/*
* Wait for the oldest in-flight IO to complete.
@@ -747,6 +811,7 @@ pgaio_io_wait_for_free(void)
{
PgAioHandle *ioh = dclist_head_element(PgAioHandle, node,
&pgaio_my_backend->in_flight_ios);
+ uint64 generation = ioh->generation;
switch (ioh->state)
{
@@ -770,13 +835,24 @@ pgaio_io_wait_for_free(void)
* In a more general case this would be racy, because the
* generation could increase after we read ioh->state above.
* But we are only looking at IOs by the current backend and
- * the IO can only be recycled by this backend.
+ * the IO can only be recycled by this backend. Even this is
+ * only OK because we get the handle's generation before
+ * potentially processing interrupts, e.g. as part of
+ * pgaio_debug_io().
*/
- pgaio_io_wait(ioh, ioh->generation);
+ pgaio_io_wait(ioh, generation);
break;
case PGAIO_HS_COMPLETED_SHARED:
- /* it's possible that another backend just finished this IO */
+
+ /*
+ * It's possible that another backend just finished this IO.
+ *
+ * Note that no interrupts are processed between the state
+ * check and the call to reclaim - that's important as
+ * otherwise an interrupt could have already reclaimed the
+ * handle.
+ */
pgaio_io_reclaim(ioh);
break;
}
@@ -926,6 +1002,11 @@ pgaio_wref_check_done(PgAioWaitRef *iow)
if (state == PGAIO_HS_COMPLETED_SHARED ||
state == PGAIO_HS_COMPLETED_LOCAL)
{
+ /*
+ * Note that no interrupts are processed between
+ * pgaio_io_was_recycled() and this check - that's important as
+ * otherwise an interrupt could have already reclaimed the handle.
+ */
if (am_owner)
pgaio_io_reclaim(ioh);
return true;
@@ -1153,11 +1234,14 @@ pgaio_closing_fd(int fd)
{
dlist_iter iter;
PgAioHandle *ioh = NULL;
+ uint64 generation;
dclist_foreach(iter, &pgaio_my_backend->in_flight_ios)
{
ioh = dclist_container(PgAioHandle, node, iter.cur);
+ generation = ioh->generation;
+
if (pgaio_io_uses_fd(ioh, fd))
break;
else
@@ -1172,7 +1256,7 @@ pgaio_closing_fd(int fd)
fd, dclist_count(&pgaio_my_backend->in_flight_ios));
/* see comment in pgaio_io_wait_for_free() about raciness */
- pgaio_io_wait(ioh, ioh->generation);
+ pgaio_io_wait(ioh, generation);
}
}
}
@@ -1201,13 +1285,14 @@ pgaio_shutdown(int code, Datum arg)
while (!dclist_is_empty(&pgaio_my_backend->in_flight_ios))
{
PgAioHandle *ioh = dclist_head_element(PgAioHandle, node, &pgaio_my_backend->in_flight_ios);
+ uint64 generation = ioh->generation;
pgaio_debug_io(DEBUG2, ioh,
"waiting for IO to complete during shutdown, %d in-flight IOs",
dclist_count(&pgaio_my_backend->in_flight_ios));
/* see comment in pgaio_io_wait_for_free() about raciness */
- pgaio_io_wait(ioh, ioh->generation);
+ pgaio_io_wait(ioh, generation);
}
pgaio_my_backend = NULL;