From fc04c39bae01a607454f7619665309870c60937a Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sun, 1 Mar 2020 19:18:19 +0300
Subject: io-wq: fix IO_WQ_WORK_NO_CANCEL cancellation

To cancel a work item, io-wq sets IO_WQ_WORK_CANCEL and executes the
callback. However, an IO_WQ_WORK_NO_CANCEL work will just execute and may
return the next work in the link, which is then ignored and lost.

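Cancel the whole link instead: keep invoking the callback with
IO_WQ_WORK_CANCEL set for as long as it hands back a new work.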

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io-wq.c | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/fs/io-wq.c b/fs/io-wq.c
index bf8ed1b0b90a..9a7aacc96d84 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -747,6 +747,17 @@ static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct,
 	return true;
 }
 
+static void io_run_cancel(struct io_wq_work *work)
+{
+	do {
+		struct io_wq_work *old_work = work;
+
+		work->flags |= IO_WQ_WORK_CANCEL;
+		work->func(&work);
+		work = (work == old_work) ? NULL : work;
+	} while (work);
+}
+
 static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
 {
 	struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
@@ -760,8 +771,7 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
 	 * It's close enough to not be an issue, fork() has the same delay.
 	 */
 	if (unlikely(!io_wq_can_queue(wqe, acct, work))) {
-		work->flags |= IO_WQ_WORK_CANCEL;
-		work->func(&work);
+		io_run_cancel(work);
 		return;
 	}
 
@@ -900,8 +910,7 @@ static enum io_wq_cancel io_wqe_cancel_cb_work(struct io_wqe *wqe,
 	spin_unlock_irqrestore(&wqe->lock, flags);
 
 	if (found) {
-		work->flags |= IO_WQ_WORK_CANCEL;
-		work->func(&work);
+		io_run_cancel(work);
 		return IO_WQ_CANCEL_OK;
 	}
 
@@ -976,8 +985,7 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe,
 	spin_unlock_irqrestore(&wqe->lock, flags);
 
 	if (found) {
-		work->flags |= IO_WQ_WORK_CANCEL;
-		work->func(&work);
+		io_run_cancel(work);
 		return IO_WQ_CANCEL_OK;
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From 80ad894382bf1d73eb688c29714fa10c0afcf2e7 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Mon, 2 Mar 2020 23:46:10 +0300
Subject: io-wq: remove io_wq_flush and IO_WQ_WORK_INTERNAL

io_wq_flush() is buggy: during cancelation of a flush, the associated
work may be passed to the caller's (i.e. io_uring's) @match callback. That
callback expects the work to be embedded in a struct io_kiocb, which the
on-stack flush work is not. Cancelation of internal work probably doesn't
make a lot of sense to begin with.

As the flush helper is no longer used, just delete it and the associated
work flag.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io-wq.c | 38 +-------------------------------------
 fs/io-wq.h |  2 --
 2 files changed, 1 insertion(+), 39 deletions(-)

diff --git a/fs/io-wq.c b/fs/io-wq.c
index 9a7aacc96d84..5cef075c0b37 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -502,7 +502,7 @@ next:
 		if (worker->mm)
 			work->flags |= IO_WQ_WORK_HAS_MM;
 
-		if (wq->get_work && !(work->flags & IO_WQ_WORK_INTERNAL)) {
+		if (wq->get_work) {
 			put_work = work;
 			wq->get_work(work);
 		}
@@ -1057,42 +1057,6 @@ enum io_wq_cancel io_wq_cancel_pid(struct io_wq *wq, pid_t pid)
 	return ret;
 }
 
-struct io_wq_flush_data {
-	struct io_wq_work work;
-	struct completion done;
-};
-
-static void io_wq_flush_func(struct io_wq_work **workptr)
-{
-	struct io_wq_work *work = *workptr;
-	struct io_wq_flush_data *data;
-
-	data = container_of(work, struct io_wq_flush_data, work);
-	complete(&data->done);
-}
-
-/*
- * Doesn't wait for previously queued work to finish. When this completes,
- * it just means that previously queued work was started.
- */
-void io_wq_flush(struct io_wq *wq)
-{
-	struct io_wq_flush_data data;
-	int node;
-
-	for_each_node(node) {
-		struct io_wqe *wqe = wq->wqes[node];
-
-		if (!node_online(node))
-			continue;
-		init_completion(&data.done);
-		INIT_IO_WORK(&data.work, io_wq_flush_func);
-		data.work.flags |= IO_WQ_WORK_INTERNAL;
-		io_wqe_enqueue(wqe, &data.work);
-		wait_for_completion(&data.done);
-	}
-}
-
 struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
 {
 	int ret = -ENOMEM, node;
diff --git a/fs/io-wq.h b/fs/io-wq.h
index 33baba4370c5..e5e15f2c93ec 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -8,7 +8,6 @@ enum {
 	IO_WQ_WORK_HAS_MM	= 2,
 	IO_WQ_WORK_HASHED	= 4,
 	IO_WQ_WORK_UNBOUND	= 32,
-	IO_WQ_WORK_INTERNAL	= 64,
 	IO_WQ_WORK_CB		= 128,
 	IO_WQ_WORK_NO_CANCEL	= 256,
 	IO_WQ_WORK_CONCURRENT	= 512,
@@ -100,7 +99,6 @@ void io_wq_destroy(struct io_wq *wq);
 
 void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work);
 void io_wq_enqueue_hashed(struct io_wq *wq, struct io_wq_work *work, void *val);
-void io_wq_flush(struct io_wq *wq);
 
 void io_wq_cancel_all(struct io_wq *wq);
 enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork);
-- 
cgit v1.2.3-59-g8ed1b


From c1e2148f8ecb26863b899d402a823dab8e26efd1 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 4 Mar 2020 07:25:50 -0700
Subject: io_uring: free fixed_file_data after RCU grace period

The percpu refcount protects this structure, and we can have an atomic
switch in progress when exiting. This makes it unsafe to just free the
struct normally, and can trigger the following KASAN warning:

BUG: KASAN: use-after-free in percpu_ref_switch_to_atomic_rcu+0xfa/0x1b0
Read of size 1 at addr ffff888181a19a30 by task swapper/0/0

CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.6.0-rc4+ #5747
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014
Call Trace:
 <IRQ>
 dump_stack+0x76/0xa0
 print_address_description.constprop.0+0x3b/0x60
 ? percpu_ref_switch_to_atomic_rcu+0xfa/0x1b0
 ? percpu_ref_switch_to_atomic_rcu+0xfa/0x1b0
 __kasan_report.cold+0x1a/0x3d
 ? percpu_ref_switch_to_atomic_rcu+0xfa/0x1b0
 percpu_ref_switch_to_atomic_rcu+0xfa/0x1b0
 rcu_core+0x370/0x830
 ? percpu_ref_exit+0x50/0x50
 ? rcu_note_context_switch+0x7b0/0x7b0
 ? run_rebalance_domains+0x11d/0x140
 __do_softirq+0x10a/0x3e9
 irq_exit+0xd5/0xe0
 smp_apic_timer_interrupt+0x86/0x200
 apic_timer_interrupt+0xf/0x20
 </IRQ>
RIP: 0010:default_idle+0x26/0x1f0

Fix this by punting the final exit and free of the struct to RCU, so
that we know it's safe to do so. Jann suggested the approach of using a
double rcu callback to achieve this. It's important that we do a nested
call_rcu() callback, as otherwise the free could be ordered before the
atomic switch, even if the latter was already queued.

Reported-by: syzbot+e017e49c39ab484ac87a@syzkaller.appspotmail.com
Suggested-by: Jann Horn <jannh@google.com>
Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 6a595c13e108..68050b61ad0e 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -191,6 +191,7 @@ struct fixed_file_data {
 	struct llist_head		put_llist;
 	struct work_struct		ref_work;
 	struct completion		done;
+	struct rcu_head			rcu;
 };
 
 struct io_ring_ctx {
@@ -5329,6 +5330,26 @@ static void io_file_ref_kill(struct percpu_ref *ref)
 	complete(&data->done);
 }
 
+static void __io_file_ref_exit_and_free(struct rcu_head *rcu)
+{
+	struct fixed_file_data *data = container_of(rcu, struct fixed_file_data,
+							rcu);
+	percpu_ref_exit(&data->refs);
+	kfree(data);
+}
+
+static void io_file_ref_exit_and_free(struct rcu_head *rcu)
+{
+	/*
+	 * We need to order our exit+free call against the potentially
+	 * existing call_rcu() for switching to atomic. One way to do that
+	 * is to have this rcu callback queue the final put and free, as we
+	 * could otherwise have a pre-existing atomic switch complete _after_
+	 * the free callback we queued.
+	 */
+	call_rcu(rcu, __io_file_ref_exit_and_free);
+}
+
 static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
 {
 	struct fixed_file_data *data = ctx->file_data;
@@ -5341,14 +5362,13 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
 	flush_work(&data->ref_work);
 	wait_for_completion(&data->done);
 	io_ring_file_ref_flush(data);
-	percpu_ref_exit(&data->refs);
 
 	__io_sqe_files_unregister(ctx);
 	nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE);
 	for (i = 0; i < nr_tables; i++)
 		kfree(data->table[i].files);
 	kfree(data->table);
-	kfree(data);
+	call_rcu(&data->rcu, io_file_ref_exit_and_free);
 	ctx->file_data = NULL;
 	ctx->nr_user_files = 0;
 	return 0;
-- 
cgit v1.2.3-59-g8ed1b


From f0e20b8943509d81200cef5e30af2adfddba0f5c Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sat, 7 Mar 2020 01:15:22 +0300
Subject: io_uring: fix lockup with timeouts

There is a recipe to deadlock the kernel: submit a timeout sqe with a
linked_timeout (e.g.  test_single_link_timeout_ception() from liburing),
and SIGKILL the process.

Then, io_kill_timeouts() takes @ctx->completion_lock, but the timeout
isn't flagged with REQ_F_COMP_LOCKED, so the lock is grabbed a second
time during io_put_req() in order to cancel the linked timeout. The same
can probably happen at the other io_kill_timeout() call site,
io_commit_cqring().

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 68050b61ad0e..c06082bb039a 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1000,6 +1000,7 @@ static void io_kill_timeout(struct io_kiocb *req)
 	if (ret != -1) {
 		atomic_inc(&req->ctx->cq_timeouts);
 		list_del_init(&req->list);
+		req->flags |= REQ_F_COMP_LOCKED;
 		io_cqring_fill_event(req, 0);
 		io_put_req(req);
 	}
-- 
cgit v1.2.3-59-g8ed1b