From 9011bf9a13e3b5710c3cfc330da829ee25b5a029 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 30 Jun 2021 21:54:03 +0100 Subject: io_uring: fix stuck fallback reqs When task_work_add() fails, we use ->exit_task_work to queue the work. That will be run only in the cancellation path, which happens either when the ctx is dying or one of tasks with inflight requests is exiting or executing. There is a good chance that such a request would just get stuck in the list potentially hodling a file, all io_uring rsrc recycling or some other resources. Nothing terrible, it'll go away at some point, but we don't want to lock them up for longer than needed. Replace that hand made ->exit_task_work with delayed_work + llist inspired by fput_many(). Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 61 ++++++++++++++++++++--------------------------------------- 1 file changed, 20 insertions(+), 41 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index e55b21fc0ab2..ceb122a4ff78 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -465,7 +465,8 @@ struct io_ring_ctx { struct mm_struct *mm_account; /* ctx exit and cancelation */ - struct callback_head *exit_task_work; + struct llist_head fallback_llist; + struct delayed_work fallback_work; struct work_struct exit_work; struct list_head tctx_list; struct completion ref_comp; @@ -859,6 +860,8 @@ struct io_kiocb { struct io_wq_work work; const struct cred *creds; + struct llist_node fallback_node; + /* store used ubuf, so we can prevent reloading */ struct io_mapped_ubuf *imu; }; @@ -1071,6 +1074,8 @@ static void io_submit_flush_completions(struct io_ring_ctx *ctx); static bool io_poll_remove_waitqs(struct io_kiocb *req); static int io_req_prep_async(struct io_kiocb *req); +static void io_fallback_req_func(struct work_struct *unused); + static struct kmem_cache *req_cachep; static const struct file_operations io_uring_fops; @@ -1202,6 +1207,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->tctx_list); INIT_LIST_HEAD(&ctx->submit_state.comp.free_list); INIT_LIST_HEAD(&ctx->locked_free_list); + INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func); return ctx; err: kfree(ctx->dummy_ubuf); @@ -1999,44 +2005,12 @@ static int io_req_task_work_add(struct io_kiocb *req) return ret; } -static bool io_run_task_work_head(struct callback_head **work_head) -{ - struct callback_head *work, *next; - bool executed = false; - - do { - work = xchg(work_head, NULL); - if (!work) - break; - - do { - next = work->next; - work->func(work); - work = next; - cond_resched(); - } while (work); - executed = true; - } while (1); - - return executed; -} - -static void io_task_work_add_head(struct callback_head **work_head, - struct callback_head *task_work) -{ - struct callback_head *head; - - do { - head = READ_ONCE(*work_head); - task_work->next = head; - } while (cmpxchg(work_head, head, task_work) != head); -} - static void io_req_task_work_add_fallback(struct io_kiocb *req, task_work_func_t cb) { init_task_work(&req->task_work, cb); - io_task_work_add_head(&req->ctx->exit_task_work, &req->task_work); + if (llist_add(&req->fallback_node, &req->ctx->fallback_llist)) + schedule_delayed_work(&req->ctx->fallback_work, 1); } static void io_req_task_cancel(struct callback_head *cb) @@ -2485,6 +2459,17 @@ static bool io_rw_should_reissue(struct io_kiocb *req) } #endif +static void io_fallback_req_func(struct work_struct *work) +{ + struct io_ring_ctx *ctx = container_of(work, struct 
io_ring_ctx, + fallback_work.work); + struct llist_node *node = llist_del_all(&ctx->fallback_llist); + struct io_kiocb *req, *tmp; + + llist_for_each_entry_safe(req, tmp, node, fallback_node) + req->task_work.func(&req->task_work); +} + static void __io_complete_rw(struct io_kiocb *req, long res, long res2, unsigned int issue_flags) { @@ -8767,11 +8752,6 @@ static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id) return -EINVAL; } -static inline bool io_run_ctx_fallback(struct io_ring_ctx *ctx) -{ - return io_run_task_work_head(&ctx->exit_task_work); -} - struct io_tctx_exit { struct callback_head task_work; struct completion completion; @@ -9036,7 +9016,6 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, ret |= io_kill_timeouts(ctx, task, cancel_all); if (task) ret |= io_run_task_work(); - ret |= io_run_ctx_fallback(ctx); if (!ret) break; cond_resched(); -- cgit v1.2.3-59-g8ed1b From 5b0a6acc73fcac5f7d17badd09275bf7b9b46603 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 30 Jun 2021 21:54:04 +0100 Subject: io_uring: simplify task_work func Since we don't really use req->task_work anymore, get rid of it together with the nasty ->func aliasing between ->io_task_work and ->task_work, and hide ->fallback_node inside of io_task_work. Also, as task_work is gone now, replace the callback type from task_work_func_t to a function taking io_kiocb to avoid casting and simplify code. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 72 +++++++++++++++++++++++------------------------------------ 1 file changed, 28 insertions(+), 44 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index ceb122a4ff78..5b840bb1e8ec 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -785,9 +785,14 @@ struct async_poll { struct io_poll_iocb *double_poll; }; +typedef void (*io_req_tw_func_t)(struct io_kiocb *req); + struct io_task_work { - struct io_wq_work_node node; - task_work_func_t func; + union { + struct io_wq_work_node node; + struct llist_node fallback_node; + }; + io_req_tw_func_t func; }; enum { @@ -850,18 +855,13 @@ struct io_kiocb { /* used with ctx->iopoll_list with reads/writes */ struct list_head inflight_entry; - union { - struct io_task_work io_task_work; - struct callback_head task_work; - }; + struct io_task_work io_task_work; /* for polled requests, i.e. 
IORING_OP_POLL_ADD and async armed poll */ struct hlist_node hash_node; struct async_poll *apoll; struct io_wq_work work; const struct cred *creds; - struct llist_node fallback_node; - /* store used ubuf, so we can prevent reloading */ struct io_mapped_ubuf *imu; }; @@ -1935,7 +1935,7 @@ static void tctx_task_work(struct callback_head *cb) ctx = req->ctx; percpu_ref_get(&ctx->refs); } - req->task_work.func(&req->task_work); + req->io_task_work.func(req); node = next; } if (wq_list_empty(&tctx->task_list)) { @@ -2006,16 +2006,16 @@ static int io_req_task_work_add(struct io_kiocb *req) } static void io_req_task_work_add_fallback(struct io_kiocb *req, - task_work_func_t cb) + io_req_tw_func_t cb) { - init_task_work(&req->task_work, cb); - if (llist_add(&req->fallback_node, &req->ctx->fallback_llist)) + req->io_task_work.func = cb; + if (llist_add(&req->io_task_work.fallback_node, + &req->ctx->fallback_llist)) schedule_delayed_work(&req->ctx->fallback_work, 1); } -static void io_req_task_cancel(struct callback_head *cb) +static void io_req_task_cancel(struct io_kiocb *req) { - struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); struct io_ring_ctx *ctx = req->ctx; /* ctx is guaranteed to stay alive while we hold uring_lock */ @@ -2024,7 +2024,7 @@ static void io_req_task_cancel(struct callback_head *cb) mutex_unlock(&ctx->uring_lock); } -static void __io_req_task_submit(struct io_kiocb *req) +static void io_req_task_submit(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; @@ -2037,17 +2037,10 @@ static void __io_req_task_submit(struct io_kiocb *req) mutex_unlock(&ctx->uring_lock); } -static void io_req_task_submit(struct callback_head *cb) -{ - struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); - - __io_req_task_submit(req); -} - static void io_req_task_queue_fail(struct io_kiocb *req, int ret) { req->result = ret; - req->task_work.func = io_req_task_cancel; + req->io_task_work.func = io_req_task_cancel; if (unlikely(io_req_task_work_add(req))) io_req_task_work_add_fallback(req, io_req_task_cancel); @@ -2055,7 +2048,7 @@ static void io_req_task_queue_fail(struct io_kiocb *req, int ret) static void io_req_task_queue(struct io_kiocb *req) { - req->task_work.func = io_req_task_submit; + req->io_task_work.func = io_req_task_submit; if (unlikely(io_req_task_work_add(req))) io_req_task_queue_fail(req, -ECANCELED); @@ -2169,18 +2162,11 @@ static inline void io_put_req(struct io_kiocb *req) io_free_req(req); } -static void io_put_req_deferred_cb(struct callback_head *cb) -{ - struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); - - io_free_req(req); -} - static void io_free_req_deferred(struct io_kiocb *req) { - req->task_work.func = io_put_req_deferred_cb; + req->io_task_work.func = io_free_req; if (unlikely(io_req_task_work_add(req))) - io_req_task_work_add_fallback(req, io_put_req_deferred_cb); + io_req_task_work_add_fallback(req, io_free_req); } static inline void io_put_req_deferred(struct io_kiocb *req, int refs) @@ -2466,8 +2452,8 @@ static void io_fallback_req_func(struct work_struct *work) struct llist_node *node = llist_del_all(&ctx->fallback_llist); struct io_kiocb *req, *tmp; - llist_for_each_entry_safe(req, tmp, node, fallback_node) - req->task_work.func(&req->task_work); + llist_for_each_entry_safe(req, tmp, node, io_task_work.fallback_node) + req->io_task_work.func(req); } static void __io_complete_rw(struct io_kiocb *req, long res, long res2, @@ -4835,7 +4821,7 @@ struct io_poll_table { }; static int __io_async_wake(struct 
io_kiocb *req, struct io_poll_iocb *poll, - __poll_t mask, task_work_func_t func) + __poll_t mask, io_req_tw_func_t func) { int ret; @@ -4848,7 +4834,7 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, list_del_init(&poll->wait.entry); req->result = mask; - req->task_work.func = func; + req->io_task_work.func = func; /* * If this fails, then the task is exiting. When a task exits, the @@ -4945,9 +4931,8 @@ static bool io_poll_complete(struct io_kiocb *req, __poll_t mask) return !(flags & IORING_CQE_F_MORE); } -static void io_poll_task_func(struct callback_head *cb) +static void io_poll_task_func(struct io_kiocb *req) { - struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *nxt; @@ -4969,7 +4954,7 @@ static void io_poll_task_func(struct callback_head *cb) if (done) { nxt = io_put_req_find_next(req); if (nxt) - __io_req_task_submit(nxt); + io_req_task_submit(nxt); } } } @@ -5078,9 +5063,8 @@ static void io_async_queue_proc(struct file *file, struct wait_queue_head *head, __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll); } -static void io_async_task_func(struct callback_head *cb) +static void io_async_task_func(struct io_kiocb *req) { - struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); struct async_poll *apoll = req->apoll; struct io_ring_ctx *ctx = req->ctx; @@ -5096,7 +5080,7 @@ static void io_async_task_func(struct callback_head *cb) spin_unlock_irq(&ctx->completion_lock); if (!READ_ONCE(apoll->poll.canceled)) - __io_req_task_submit(req); + io_req_task_submit(req); else io_req_complete_failed(req, -ECANCELED); } @@ -8817,7 +8801,7 @@ static void io_ring_exit_work(struct work_struct *work) /* * Some may use context even when all refs and requests have been put, * and they are free to do so while still holding uring_lock or - * completion_lock, see __io_req_task_submit(). Apart from other work, + * completion_lock, see io_req_task_submit(). Apart from other work, * this lock/unlock section also waits them to finish. */ mutex_lock(&ctx->uring_lock); -- cgit v1.2.3-59-g8ed1b From e09ee510600b941c62e94f6b59878cf53ba0e447 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 1 Jul 2021 13:26:05 +0100 Subject: io_uring: fix exiting io_req_task_work_add leaks If one entered io_req_task_work_add() not seeing PF_EXITING, it will set a ->task_state bit and try task_work_add(), which may fail by that moment. If that happens the function would try to cancel the request. However, in a meanwhile there might come other io_req_task_work_add() callers, which will see the bit set and leave their requests in the list, which will never be executed. Don't propagate an error, but clear the bit first and then fallback all requests that we can splice from the list. The callback functions have to be able to deal with PF_EXITING, so poll and apoll was modified via changing io_poll_rewait(). 
Fixes: 7cbf1722d5fc ("io_uring: provide FIFO ordering for task_work") Reported-by: Jens Axboe Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/060002f19f1fdbd130ba24aef818ea4d3080819b.1625142209.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 70 ++++++++++++++++++++--------------------------------------- 1 file changed, 24 insertions(+), 46 deletions(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 5b840bb1e8ec..881856088990 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1952,17 +1952,13 @@ static void tctx_task_work(struct callback_head *cb) ctx_flush_and_put(ctx); } -static int io_req_task_work_add(struct io_kiocb *req) +static void io_req_task_work_add(struct io_kiocb *req) { struct task_struct *tsk = req->task; struct io_uring_task *tctx = tsk->io_uring; enum task_work_notify_mode notify; - struct io_wq_work_node *node, *prev; + struct io_wq_work_node *node; unsigned long flags; - int ret = 0; - - if (unlikely(tsk->flags & PF_EXITING)) - return -ESRCH; WARN_ON_ONCE(!tctx); @@ -1973,7 +1969,9 @@ static int io_req_task_work_add(struct io_kiocb *req) /* task_work already pending, we're done */ if (test_bit(0, &tctx->task_state) || test_and_set_bit(0, &tctx->task_state)) - return 0; + return; + if (unlikely(tsk->flags & PF_EXITING)) + goto fail; /* * SQPOLL kernel thread doesn't need notification, just a wakeup. For @@ -1982,36 +1980,24 @@ static int io_req_task_work_add(struct io_kiocb *req) * will do the job. */ notify = (req->ctx->flags & IORING_SETUP_SQPOLL) ? TWA_NONE : TWA_SIGNAL; - if (!task_work_add(tsk, &tctx->task_work, notify)) { wake_up_process(tsk); - return 0; + return; } - - /* - * Slow path - we failed, find and delete work. if the work is not - * in the list, it got run and we're fine. 
- */ +fail: + clear_bit(0, &tctx->task_state); spin_lock_irqsave(&tctx->task_lock, flags); - wq_list_for_each(node, prev, &tctx->task_list) { - if (&req->io_task_work.node == node) { - wq_list_del(&tctx->task_list, node, prev); - ret = 1; - break; - } - } + node = tctx->task_list.first; + INIT_WQ_LIST(&tctx->task_list); spin_unlock_irqrestore(&tctx->task_lock, flags); - clear_bit(0, &tctx->task_state); - return ret; -} -static void io_req_task_work_add_fallback(struct io_kiocb *req, - io_req_tw_func_t cb) -{ - req->io_task_work.func = cb; - if (llist_add(&req->io_task_work.fallback_node, - &req->ctx->fallback_llist)) - schedule_delayed_work(&req->ctx->fallback_work, 1); + while (node) { + req = container_of(node, struct io_kiocb, io_task_work.node); + node = node->next; + if (llist_add(&req->io_task_work.fallback_node, + &req->ctx->fallback_llist)) + schedule_delayed_work(&req->ctx->fallback_work, 1); + } } static void io_req_task_cancel(struct io_kiocb *req) @@ -2041,17 +2027,13 @@ static void io_req_task_queue_fail(struct io_kiocb *req, int ret) { req->result = ret; req->io_task_work.func = io_req_task_cancel; - - if (unlikely(io_req_task_work_add(req))) - io_req_task_work_add_fallback(req, io_req_task_cancel); + io_req_task_work_add(req); } static void io_req_task_queue(struct io_kiocb *req) { req->io_task_work.func = io_req_task_submit; - - if (unlikely(io_req_task_work_add(req))) - io_req_task_queue_fail(req, -ECANCELED); + io_req_task_work_add(req); } static inline void io_queue_next(struct io_kiocb *req) @@ -2165,8 +2147,7 @@ static inline void io_put_req(struct io_kiocb *req) static void io_free_req_deferred(struct io_kiocb *req) { req->io_task_work.func = io_free_req; - if (unlikely(io_req_task_work_add(req))) - io_req_task_work_add_fallback(req, io_free_req); + io_req_task_work_add(req); } static inline void io_put_req_deferred(struct io_kiocb *req, int refs) @@ -4823,8 +4804,6 @@ struct io_poll_table { static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, __poll_t mask, io_req_tw_func_t func) { - int ret; - /* for instances that support it check for an event match first: */ if (mask && !(mask & poll->events)) return 0; @@ -4842,11 +4821,7 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, * of executing it. We can't safely execute it anyway, as we may not * have the needed state needed for it anyway. */ - ret = io_req_task_work_add(req); - if (unlikely(ret)) { - WRITE_ONCE(poll->canceled, true); - io_req_task_work_add_fallback(req, func); - } + io_req_task_work_add(req); return 1; } @@ -4855,6 +4830,9 @@ static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll) { struct io_ring_ctx *ctx = req->ctx; + if (unlikely(req->task->flags & PF_EXITING)) + WRITE_ONCE(poll->canceled, true); + if (!req->result && !READ_ONCE(poll->canceled)) { struct poll_table_struct pt = { ._key = poll->events }; -- cgit v1.2.3-59-g8ed1b From c32aace0cf93383fde48c60ce0ae0c9073b6d360 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 7 Jul 2021 19:24:24 +0100 Subject: io_uring: fix drain alloc fail return code After a recent change io_drain_req() started to fail requests with result=0 in case of allocation failure, where it should be and have been -ENOMEM. 
Fixes: 76cc33d79175a ("io_uring: refactor io_req_defer()") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/e068110ac4293e0c56cfc4d280d0f22b9303ec08.1625682153.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 881856088990..8f2a66903f5a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6019,7 +6019,7 @@ static bool io_drain_req(struct io_kiocb *req) io_prep_async_link(req); de = kmalloc(sizeof(*de), GFP_KERNEL); if (!de) { - io_req_complete_failed(req, ret); + io_req_complete_failed(req, -ENOMEM); return true; } -- cgit v1.2.3-59-g8ed1b From 8f487ef2cbb2d4f6ca8c113d70da63baaf68c91a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 8 Jul 2021 13:37:06 +0100 Subject: io_uring: mitigate unlikely iopoll lag We have requests like IORING_OP_FILES_UPDATE that don't go through ->iopoll_list but get completed in place under ->uring_lock, and so after dropping the lock io_iopoll_check() should expect that some CQEs might have get completed in a meanwhile. Currently such events won't be accounted in @nr_events, and the loop will continue to poll even if there is enough of CQEs. It shouldn't be a problem as it's not likely to happen and so, but not nice either. Just return earlier in this case, it should be enough. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/66ef932cc66a34e3771bbae04b2953a8058e9d05.1625747741.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 8f2a66903f5a..7167c61c6d1b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2356,11 +2356,15 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) * very same mutex. */ if (list_empty(&ctx->iopoll_list)) { + u32 tail = ctx->cached_cq_tail; + mutex_unlock(&ctx->uring_lock); io_run_task_work(); mutex_lock(&ctx->uring_lock); - if (list_empty(&ctx->iopoll_list)) + /* some requests don't go through iopoll_list */ + if (tail != ctx->cached_cq_tail || + list_empty(&ctx->iopoll_list)) break; } ret = io_do_iopoll(ctx, &nr_events, min); -- cgit v1.2.3-59-g8ed1b From 9ce85ef2cb5c738754837a6937e120694cde33c9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 9 Jul 2021 08:20:28 -0600 Subject: io_uring: remove dead non-zero 'poll' check Colin reports that Coverity complains about checking for poll being non-zero after having dereferenced it multiple times. This is a valid complaint, and actually a leftover from back when this code was based on the aio poll code. Kill the redundant check. Link: https://lore.kernel.org/io-uring/fe70c532-e2a7-3722-58a1-0fa4e5c5ff2c@canonical.com/ Reported-by: Colin Ian King Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 7167c61c6d1b..d94fb5835a20 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4956,7 +4956,7 @@ static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode, list_del_init(&wait->entry); - if (poll && poll->head) { + if (poll->head) { bool done; spin_lock(&poll->head->lock); -- cgit v1.2.3-59-g8ed1b
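
The first two patches above replace the hand-rolled `->exit_task_work` callback list with the same pattern `fput_many()` uses: producers push onto a lock-free `llist`, and only the push that finds the list empty schedules the delayed work, which then detaches the whole list with `llist_del_all()` and walks it. The sketch below is a hedged userspace stand-in for that pattern, not the kernel implementation: C11 atomics play the role of `llist_add()`/`llist_del_all()`, a plain pthread stands in for `schedule_delayed_work()`, and all names (`fallback_node`, `fallback_add`, `fallback_drain`) are made up for illustration.

```c
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct fallback_node {
	struct fallback_node *next;
	void (*func)(struct fallback_node *node);	/* like io_task_work.func */
};

static _Atomic(struct fallback_node *) fallback_list;

/* Returns true if the list was empty, i.e. the caller must kick the drain. */
static bool fallback_add(struct fallback_node *node)
{
	struct fallback_node *first = atomic_load(&fallback_list);

	do {
		node->next = first;
	} while (!atomic_compare_exchange_weak(&fallback_list, &first, node));
	return first == NULL;
}

/* Stand-in for the delayed work callback (io_fallback_req_func above). */
static void *fallback_drain(void *arg)
{
	/* Detach the whole list in one shot, like llist_del_all(). */
	struct fallback_node *node =
		atomic_exchange(&fallback_list, (struct fallback_node *)NULL);

	(void)arg;
	while (node) {
		struct fallback_node *next = node->next;	/* "_safe" walk */

		node->func(node);
		node = next;
	}
	return NULL;
}

static void say_hi(struct fallback_node *node)
{
	printf("ran fallback callback for node %p\n", (void *)node);
}

int main(void)
{
	struct fallback_node reqs[3] = {
		{ .func = say_hi }, { .func = say_hi }, { .func = say_hi },
	};
	pthread_t drainer;
	bool need_kick = false;

	/* Only the first push sees an empty list and asks for a kick. */
	for (int i = 0; i < 3; i++)
		need_kick |= fallback_add(&reqs[i]);

	if (need_kick) {
		pthread_create(&drainer, NULL, fallback_drain, NULL);
		pthread_join(drainer, NULL);
	}
	return 0;
}
```

The "schedule only on the empty-to-non-empty transition" gating is the key design choice: at most one drain is ever pending, later producers pay only a lock-free push, and nothing can get stuck in the list waiting for a cancellation path that may never run.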
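
The "mitigate unlikely iopoll lag" patch relies on a simpler idea: some requests complete inline under `uring_lock` without ever touching `->iopoll_list`, so the poll loop snapshots the CQ tail before dropping the lock and stops spinning if the tail has moved by the time it reacquires it. Below is a minimal, hedged illustration of that snapshot-and-recheck pattern under assumed names (`fake_ctx`, `cached_cq_tail`, `iopoll_pending`, `complete_inline`); it is not io_uring code, just the shape of the check.

```c
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct fake_ctx {
	pthread_mutex_t uring_lock;
	unsigned int cached_cq_tail;	/* bumped for every posted CQE */
	int iopoll_pending;		/* stand-in for !list_empty(&iopoll_list) */
};

/* Side path: completes a request inline, bypassing the iopoll list. */
static void complete_inline(struct fake_ctx *ctx)
{
	pthread_mutex_lock(&ctx->uring_lock);
	ctx->cached_cq_tail++;
	pthread_mutex_unlock(&ctx->uring_lock);
}

static bool should_keep_polling(struct fake_ctx *ctx)
{
	bool keep;

	pthread_mutex_lock(&ctx->uring_lock);
	if (!ctx->iopoll_pending) {
		/* Snapshot the tail before letting other work run. */
		unsigned int tail = ctx->cached_cq_tail;

		pthread_mutex_unlock(&ctx->uring_lock);
		/* ...task_work would run here; a completion sneaks in... */
		complete_inline(ctx);
		pthread_mutex_lock(&ctx->uring_lock);

		/* Some requests don't go through the iopoll list. */
		keep = (tail == ctx->cached_cq_tail) && ctx->iopoll_pending;
	} else {
		keep = true;
	}
	pthread_mutex_unlock(&ctx->uring_lock);
	return keep;
}

int main(void)
{
	struct fake_ctx ctx = { .uring_lock = PTHREAD_MUTEX_INITIALIZER };

	printf("keep polling? %s\n", should_keep_polling(&ctx) ? "yes" : "no");
	return 0;
}
```

Here the inline completion advances the tail while the lock is dropped, so the recheck reports that polling can stop, which is exactly the early return the patch adds to `io_iopoll_check()`.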