diff options
-rw-r--r-- | src/backend/storage/aio/aio.c | 113 |
1 files changed, 99 insertions, 14 deletions
diff --git a/src/backend/storage/aio/aio.c b/src/backend/storage/aio/aio.c index ebb5a771bfd..c64d815ebd1 100644 --- a/src/backend/storage/aio/aio.c +++ b/src/backend/storage/aio/aio.c @@ -184,6 +184,8 @@ pgaio_io_acquire(struct ResourceOwnerData *resowner, PgAioReturn *ret) PgAioHandle * pgaio_io_acquire_nb(struct ResourceOwnerData *resowner, PgAioReturn *ret) { + PgAioHandle *ioh = NULL; + if (pgaio_my_backend->num_staged_ios >= PGAIO_SUBMIT_BATCH_SIZE) { Assert(pgaio_my_backend->num_staged_ios == PGAIO_SUBMIT_BATCH_SIZE); @@ -193,10 +195,17 @@ pgaio_io_acquire_nb(struct ResourceOwnerData *resowner, PgAioReturn *ret) if (pgaio_my_backend->handed_out_io) elog(ERROR, "API violation: Only one IO can be handed out"); + /* + * Probably not needed today, as interrupts should not process this IO, + * but... + */ + HOLD_INTERRUPTS(); + if (!dclist_is_empty(&pgaio_my_backend->idle_ios)) { dlist_node *ion = dclist_pop_head_node(&pgaio_my_backend->idle_ios); - PgAioHandle *ioh = dclist_container(PgAioHandle, node, ion); + + ioh = dclist_container(PgAioHandle, node, ion); Assert(ioh->state == PGAIO_HS_IDLE); Assert(ioh->owner_procno == MyProcNumber); @@ -212,11 +221,11 @@ pgaio_io_acquire_nb(struct ResourceOwnerData *resowner, PgAioReturn *ret) ioh->report_return = ret; ret->result.status = PGAIO_RS_UNKNOWN; } - - return ioh; } - return NULL; + RESUME_INTERRUPTS(); + + return ioh; } /* @@ -233,6 +242,12 @@ pgaio_io_release(PgAioHandle *ioh) Assert(ioh->resowner); pgaio_my_backend->handed_out_io = NULL; + + /* + * Note that no interrupts are processed between the handed_out_io + * check and the call to reclaim - that's important as otherwise an + * interrupt could have already reclaimed the handle. + */ pgaio_io_reclaim(ioh); } else @@ -251,6 +266,12 @@ pgaio_io_release_resowner(dlist_node *ioh_node, bool on_error) Assert(ioh->resowner); + /* + * Otherwise an interrupt, in the middle of releasing the IO, could end up + * trying to wait for the IO, leading to state confusion. + */ + HOLD_INTERRUPTS(); + ResourceOwnerForgetAioHandle(ioh->resowner, &ioh->resowner_node); ioh->resowner = NULL; @@ -291,6 +312,8 @@ pgaio_io_release_resowner(dlist_node *ioh_node, bool on_error) */ if (ioh->report_return) ioh->report_return = NULL; + + RESUME_INTERRUPTS(); } /* @@ -359,6 +382,13 @@ pgaio_io_get_wref(PgAioHandle *ioh, PgAioWaitRef *iow) static inline void pgaio_io_update_state(PgAioHandle *ioh, PgAioHandleState new_state) { + /* + * All callers need to have held interrupts in some form, otherwise + * interrupt processing could wait for the IO to complete, while in an + * intermediary state. + */ + Assert(!INTERRUPTS_CAN_BE_PROCESSED()); + pgaio_debug_io(DEBUG5, ioh, "updating state to %s", pgaio_io_state_get_name(new_state)); @@ -396,6 +426,13 @@ pgaio_io_stage(PgAioHandle *ioh, PgAioOp op) Assert(pgaio_my_backend->handed_out_io == ioh); Assert(pgaio_io_has_target(ioh)); + /* + * Otherwise an interrupt, in the middle of staging and possibly executing + * the IO, could end up trying to wait for the IO, leading to state + * confusion. + */ + HOLD_INTERRUPTS(); + ioh->op = op; ioh->result = 0; @@ -435,6 +472,8 @@ pgaio_io_stage(PgAioHandle *ioh, PgAioOp op) pgaio_io_prepare_submit(ioh); pgaio_io_perform_synchronously(ioh); } + + RESUME_INTERRUPTS(); } bool @@ -544,8 +583,8 @@ pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation) && state != PGAIO_HS_COMPLETED_SHARED && state != PGAIO_HS_COMPLETED_LOCAL) { - elog(PANIC, "waiting for own IO in wrong state: %d", - state); + elog(PANIC, "waiting for own IO %d in wrong state: %s", + pgaio_io_get_id(ioh), pgaio_io_get_state_name(ioh)); } } @@ -599,7 +638,13 @@ pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation) case PGAIO_HS_COMPLETED_SHARED: case PGAIO_HS_COMPLETED_LOCAL: - /* see above */ + + /* + * Note that no interrupts are processed between + * pgaio_io_was_recycled() and this check - that's important + * as otherwise an interrupt could have already reclaimed the + * handle. + */ if (am_owner) pgaio_io_reclaim(ioh); return; @@ -610,6 +655,11 @@ pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation) /* * Make IO handle ready to be reused after IO has completed or after the * handle has been released without being used. + * + * Note that callers need to be careful about only calling this in the right + * state and that no interrupts can be processed between the state check and + * the call to pgaio_io_reclaim(). Otherwise interrupt processing could + * already have reclaimed the handle. */ static void pgaio_io_reclaim(PgAioHandle *ioh) @@ -618,6 +668,9 @@ pgaio_io_reclaim(PgAioHandle *ioh) Assert(ioh->owner_procno == MyProcNumber); Assert(ioh->state != PGAIO_HS_IDLE); + /* see comment in function header */ + HOLD_INTERRUPTS(); + /* * It's a bit ugly, but right now the easiest place to put the execution * of local completion callbacks is this function, as we need to execute @@ -685,6 +738,8 @@ pgaio_io_reclaim(PgAioHandle *ioh) * efficient in cases where only a few IOs are used. */ dclist_push_head(&pgaio_my_backend->idle_ios, &ioh->node); + + RESUME_INTERRUPTS(); } /* @@ -700,7 +755,7 @@ pgaio_io_wait_for_free(void) pgaio_debug(DEBUG2, "waiting for free IO with %d pending, %d in-flight, %d idle IOs", pgaio_my_backend->num_staged_ios, dclist_count(&pgaio_my_backend->in_flight_ios), - dclist_is_empty(&pgaio_my_backend->idle_ios)); + dclist_count(&pgaio_my_backend->idle_ios)); /* * First check if any of our IOs actually have completed - when using @@ -714,6 +769,11 @@ pgaio_io_wait_for_free(void) if (ioh->state == PGAIO_HS_COMPLETED_SHARED) { + /* + * Note that no interrupts are processed between the state check + * and the call to reclaim - that's important as otherwise an + * interrupt could have already reclaimed the handle. + */ pgaio_io_reclaim(ioh); reclaimed++; } @@ -730,13 +790,17 @@ pgaio_io_wait_for_free(void) if (pgaio_my_backend->num_staged_ios > 0) pgaio_submit_staged(); + /* possibly some IOs finished during submission */ + if (!dclist_is_empty(&pgaio_my_backend->idle_ios)) + return; + if (dclist_count(&pgaio_my_backend->in_flight_ios) == 0) ereport(ERROR, errmsg_internal("no free IOs despite no in-flight IOs"), errdetail_internal("%d pending, %d in-flight, %d idle IOs", pgaio_my_backend->num_staged_ios, dclist_count(&pgaio_my_backend->in_flight_ios), - dclist_is_empty(&pgaio_my_backend->idle_ios))); + dclist_count(&pgaio_my_backend->idle_ios))); /* * Wait for the oldest in-flight IO to complete. @@ -747,6 +811,7 @@ pgaio_io_wait_for_free(void) { PgAioHandle *ioh = dclist_head_element(PgAioHandle, node, &pgaio_my_backend->in_flight_ios); + uint64 generation = ioh->generation; switch (ioh->state) { @@ -770,13 +835,24 @@ pgaio_io_wait_for_free(void) * In a more general case this would be racy, because the * generation could increase after we read ioh->state above. * But we are only looking at IOs by the current backend and - * the IO can only be recycled by this backend. + * the IO can only be recycled by this backend. Even this is + * only OK because we get the handle's generation before + * potentially processing interrupts, e.g. as part of + * pgaio_debug_io(). */ - pgaio_io_wait(ioh, ioh->generation); + pgaio_io_wait(ioh, generation); break; case PGAIO_HS_COMPLETED_SHARED: - /* it's possible that another backend just finished this IO */ + + /* + * It's possible that another backend just finished this IO. + * + * Note that no interrupts are processed between the state + * check and the call to reclaim - that's important as + * otherwise an interrupt could have already reclaimed the + * handle. + */ pgaio_io_reclaim(ioh); break; } @@ -926,6 +1002,11 @@ pgaio_wref_check_done(PgAioWaitRef *iow) if (state == PGAIO_HS_COMPLETED_SHARED || state == PGAIO_HS_COMPLETED_LOCAL) { + /* + * Note that no interrupts are processed between + * pgaio_io_was_recycled() and this check - that's important as + * otherwise an interrupt could have already reclaimed the handle. + */ if (am_owner) pgaio_io_reclaim(ioh); return true; @@ -1153,11 +1234,14 @@ pgaio_closing_fd(int fd) { dlist_iter iter; PgAioHandle *ioh = NULL; + uint64 generation; dclist_foreach(iter, &pgaio_my_backend->in_flight_ios) { ioh = dclist_container(PgAioHandle, node, iter.cur); + generation = ioh->generation; + if (pgaio_io_uses_fd(ioh, fd)) break; else @@ -1172,7 +1256,7 @@ pgaio_closing_fd(int fd) fd, dclist_count(&pgaio_my_backend->in_flight_ios)); /* see comment in pgaio_io_wait_for_free() about raciness */ - pgaio_io_wait(ioh, ioh->generation); + pgaio_io_wait(ioh, generation); } } } @@ -1201,13 +1285,14 @@ pgaio_shutdown(int code, Datum arg) while (!dclist_is_empty(&pgaio_my_backend->in_flight_ios)) { PgAioHandle *ioh = dclist_head_element(PgAioHandle, node, &pgaio_my_backend->in_flight_ios); + uint64 generation = ioh->generation; pgaio_debug_io(DEBUG2, ioh, "waiting for IO to complete during shutdown, %d in-flight IOs", dclist_count(&pgaio_my_backend->in_flight_ios)); /* see comment in pgaio_io_wait_for_free() about raciness */ - pgaio_io_wait(ioh, ioh->generation); + pgaio_io_wait(ioh, generation); } pgaio_my_backend = NULL; |