From 87ce955b24c9940cb2ca7e5173fcf175578d9fe9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 30 Jan 2020 08:25:34 -0700 Subject: io_uring: add ->show_fdinfo() for the io_uring file descriptor It can be hard to know exactly what is registered with the ring. Especially for credentials, it'd be handy to be able to see which ones are registered, what personalities they have, and what the ID of each of them is. This adds support for showing information registered in the ring from the fdinfo of the io_uring fd. Here's an example from a test case that registers 4 files (two of them sparse), 4 buffers, and 2 personalities: pos: 0 flags: 02000002 mnt_id: 14 UserFiles: 4 0: file-no-1 1: file-no-2 2: 3: UserBufs: 4 0: 0x563817c46000/128 1: 0x563817c47000/256 2: 0x563817c48000/512 3: 0x563817c49000/1024 Personalities: 1 Uid: 0 0 0 0 Gid: 0 0 0 0 Groups: 0 CapEff: 0000003fffffffff 2 Uid: 0 0 0 0 Gid: 0 0 0 0 Groups: 0 CapEff: 0000003fffffffff Suggested-by: Jann Horn Signed-off-by: Jens Axboe --- fs/io_uring.c | 75 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index ac5340fdcdfe..81a787ba6960 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6501,6 +6501,80 @@ out_fput: return submitted ? submitted : ret; } +static int io_uring_show_cred(int id, void *p, void *data) +{ + const struct cred *cred = p; + struct seq_file *m = data; + struct user_namespace *uns = seq_user_ns(m); + struct group_info *gi; + kernel_cap_t cap; + unsigned __capi; + int g; + + seq_printf(m, "%5d\n", id); + seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid)); + seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid)); + seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid)); + seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid)); + seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid)); + seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid)); + seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid)); + seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid)); + seq_puts(m, "\n\tGroups:\t"); + gi = cred->group_info; + for (g = 0; g < gi->ngroups; g++) { + seq_put_decimal_ull(m, g ? 
" " : "", + from_kgid_munged(uns, gi->gid[g])); + } + seq_puts(m, "\n\tCapEff:\t"); + cap = cred->cap_effective; + CAP_FOR_EACH_U32(__capi) + seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8); + seq_putc(m, '\n'); + return 0; +} + +static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) +{ + int i; + + mutex_lock(&ctx->uring_lock); + seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files); + for (i = 0; i < ctx->nr_user_files; i++) { + struct fixed_file_table *table; + struct file *f; + + table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT]; + f = table->files[i & IORING_FILE_TABLE_MASK]; + if (f) + seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname); + else + seq_printf(m, "%5u: \n", i); + } + seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs); + for (i = 0; i < ctx->nr_user_bufs; i++) { + struct io_mapped_ubuf *buf = &ctx->user_bufs[i]; + + seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, + (unsigned int) buf->len); + } + if (!idr_is_empty(&ctx->personality_idr)) { + seq_printf(m, "Personalities:\n"); + idr_for_each(&ctx->personality_idr, io_uring_show_cred, m); + } + mutex_unlock(&ctx->uring_lock); +} + +static void io_uring_show_fdinfo(struct seq_file *m, struct file *f) +{ + struct io_ring_ctx *ctx = f->private_data; + + if (percpu_ref_tryget(&ctx->refs)) { + __io_uring_show_fdinfo(ctx, m); + percpu_ref_put(&ctx->refs); + } +} + static const struct file_operations io_uring_fops = { .release = io_uring_release, .flush = io_uring_flush, @@ -6511,6 +6585,7 @@ static const struct file_operations io_uring_fops = { #endif .poll = io_uring_poll, .fasync = io_uring_fasync, + .show_fdinfo = io_uring_show_fdinfo, }; static int io_allocate_scq_urings(struct io_ring_ctx *ctx, -- cgit v1.2.3-59-g8ed1b From d7f62e825fd19202a0749d10fb439714c51f67d2 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Wed, 29 Jan 2020 14:39:41 +0100 Subject: io_uring: add BUILD_BUG_ON() to assert the layout of struct io_uring_sqe With nesting of anonymous unions and structs it's hard to review layout changes. It's better to ask the compiler for these things. 
Signed-off-by: Stefan Metzmacher Signed-off-by: Jens Axboe --- fs/io_uring.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 81a787ba6960..217721c7bc41 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7038,6 +7038,39 @@ out_fput: static int __init io_uring_init(void) { +#define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \ + BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \ + BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \ +} while (0) + +#define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \ + __BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename) + BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64); + BUILD_BUG_SQE_ELEM(0, __u8, opcode); + BUILD_BUG_SQE_ELEM(1, __u8, flags); + BUILD_BUG_SQE_ELEM(2, __u16, ioprio); + BUILD_BUG_SQE_ELEM(4, __s32, fd); + BUILD_BUG_SQE_ELEM(8, __u64, off); + BUILD_BUG_SQE_ELEM(8, __u64, addr2); + BUILD_BUG_SQE_ELEM(16, __u64, addr); + BUILD_BUG_SQE_ELEM(24, __u32, len); + BUILD_BUG_SQE_ELEM(28, __kernel_rwf_t, rw_flags); + BUILD_BUG_SQE_ELEM(28, /* compat */ int, rw_flags); + BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags); + BUILD_BUG_SQE_ELEM(28, __u32, fsync_flags); + BUILD_BUG_SQE_ELEM(28, __u16, poll_events); + BUILD_BUG_SQE_ELEM(28, __u32, sync_range_flags); + BUILD_BUG_SQE_ELEM(28, __u32, msg_flags); + BUILD_BUG_SQE_ELEM(28, __u32, timeout_flags); + BUILD_BUG_SQE_ELEM(28, __u32, accept_flags); + BUILD_BUG_SQE_ELEM(28, __u32, cancel_flags); + BUILD_BUG_SQE_ELEM(28, __u32, open_flags); + BUILD_BUG_SQE_ELEM(28, __u32, statx_flags); + BUILD_BUG_SQE_ELEM(28, __u32, fadvise_advice); + BUILD_BUG_SQE_ELEM(32, __u64, user_data); + BUILD_BUG_SQE_ELEM(40, __u16, buf_index); + BUILD_BUG_SQE_ELEM(42, __u16, personality); + BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST); req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC); return 0; -- cgit v1.2.3-59-g8ed1b From b5e683d5cab8cd433b06ae178621f083cabd4f63 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 2 Feb 2020 08:23:03 -0700 Subject: eventfd: track eventfd_signal() recursion depth eventfd use cases from aio and io_uring can deadlock due to circular or resursive calling, when eventfd_signal() tries to grab the waitqueue lock. On top of that, it's also possible to construct notification chains that are deep enough that we could blow the stack. Add a percpu counter that tracks the percpu recursion depth, warn if we exceed it. The counter is also exposed so that users of eventfd_signal() can do the right thing if it's non-zero in the context where it is called. Cc: stable@vger.kernel.org # 4.19+ Signed-off-by: Jens Axboe --- fs/eventfd.c | 15 +++++++++++++++ include/linux/eventfd.h | 14 ++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/fs/eventfd.c b/fs/eventfd.c index 8aa0ea8c55e8..78e41c7c3d05 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -24,6 +24,8 @@ #include #include +DEFINE_PER_CPU(int, eventfd_wake_count); + static DEFINE_IDA(eventfd_ida); struct eventfd_ctx { @@ -60,12 +62,25 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n) { unsigned long flags; + /* + * Deadlock or stack overflow issues can happen if we recurse here + * through waitqueue wakeup handlers. If the caller users potentially + * nested waitqueues with custom wakeup handlers, then it should + * check eventfd_signal_count() before calling this function. If + * it returns true, the eventfd_signal() call should be deferred to a + * safe context. 
+ */ + if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count))) + return 0; + spin_lock_irqsave(&ctx->wqh.lock, flags); + this_cpu_inc(eventfd_wake_count); if (ULLONG_MAX - ctx->count < n) n = ULLONG_MAX - ctx->count; ctx->count += n; if (waitqueue_active(&ctx->wqh)) wake_up_locked_poll(&ctx->wqh, EPOLLIN); + this_cpu_dec(eventfd_wake_count); spin_unlock_irqrestore(&ctx->wqh.lock, flags); return n; diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h index ffcc7724ca21..dc4fd8a6644d 100644 --- a/include/linux/eventfd.h +++ b/include/linux/eventfd.h @@ -12,6 +12,8 @@ #include #include #include +#include +#include /* * CAREFUL: Check include/uapi/asm-generic/fcntl.h when defining @@ -40,6 +42,13 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n); int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait, __u64 *cnt); +DECLARE_PER_CPU(int, eventfd_wake_count); + +static inline bool eventfd_signal_count(void) +{ + return this_cpu_read(eventfd_wake_count); +} + #else /* CONFIG_EVENTFD */ /* @@ -68,6 +77,11 @@ static inline int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, return -ENOSYS; } +static inline bool eventfd_signal_count(void) +{ + return false; +} + #endif #endif /* _LINUX_EVENTFD_H */ -- cgit v1.2.3-59-g8ed1b From f0b493e6b9a8959356983f57112229e69c2f7b8c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 1 Feb 2020 21:30:11 -0700 Subject: io_uring: prevent potential eventfd recursion on poll If we have nested or circular eventfd wakeups, then we can deadlock if we run them inline from our poll waitqueue wakeup handler. It's also possible to have very long chains of notifications, to the extent where we could risk blowing the stack. Check the eventfd recursion count before calling eventfd_signal(). If it's non-zero, then punt the signaling to async context. This is always safe, as it takes us out-of-line in terms of stack and locking context. 
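As a rough illustration of that rule (hypothetical my_* names, a sketch rather than the io_uring code itself): a caller that may be running inside another eventfd wakeup checks eventfd_signal_count() and, when it is non-zero, bounces the signal to a workqueue.

#include <linux/kernel.h>
#include <linux/eventfd.h>
#include <linux/workqueue.h>

struct my_ctx {
	struct eventfd_ctx *evfd;
	struct work_struct evfd_work;	/* INIT_WORK(&ctx->evfd_work, my_evfd_work) at setup */
};

static void my_evfd_work(struct work_struct *work)
{
	struct my_ctx *ctx = container_of(work, struct my_ctx, evfd_work);

	eventfd_signal(ctx->evfd, 1);	/* plain process context: safe to signal */
}

static void my_notify(struct my_ctx *ctx)
{
	if (eventfd_signal_count())
		schedule_work(&ctx->evfd_work);	/* nested inside a wakeup: defer */
	else
		eventfd_signal(ctx->evfd, 1);
}
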
Cc: stable@vger.kernel.org # 5.1+ Signed-off-by: Jens Axboe --- fs/io_uring.c | 37 ++++++++++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 7 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 217721c7bc41..43f3b7d90299 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1020,21 +1020,28 @@ static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx) static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx) { + if (!ctx->cq_ev_fd) + return false; if (!ctx->eventfd_async) return true; return io_wq_current_is_worker() || in_interrupt(); } -static void io_cqring_ev_posted(struct io_ring_ctx *ctx) +static void __io_cqring_ev_posted(struct io_ring_ctx *ctx, bool trigger_ev) { if (waitqueue_active(&ctx->wait)) wake_up(&ctx->wait); if (waitqueue_active(&ctx->sqo_wait)) wake_up(&ctx->sqo_wait); - if (ctx->cq_ev_fd && io_should_trigger_evfd(ctx)) + if (trigger_ev) eventfd_signal(ctx->cq_ev_fd, 1); } +static void io_cqring_ev_posted(struct io_ring_ctx *ctx) +{ + __io_cqring_ev_posted(ctx, io_should_trigger_evfd(ctx)); +} + /* Returns true if there are no backlogged entries after the flush */ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) { @@ -3561,6 +3568,14 @@ static void io_poll_flush(struct io_wq_work **workptr) __io_poll_flush(req->ctx, nodes); } +static void io_poll_trigger_evfd(struct io_wq_work **workptr) +{ + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); + + eventfd_signal(req->ctx->cq_ev_fd, 1); + io_put_req(req); +} + static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, void *key) { @@ -3586,14 +3601,22 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, if (llist_empty(&ctx->poll_llist) && spin_trylock_irqsave(&ctx->completion_lock, flags)) { + bool trigger_ev; + hash_del(&req->hash_node); io_poll_complete(req, mask, 0); - req->flags |= REQ_F_COMP_LOCKED; - io_put_req(req); - spin_unlock_irqrestore(&ctx->completion_lock, flags); - io_cqring_ev_posted(ctx); - req = NULL; + trigger_ev = io_should_trigger_evfd(ctx); + if (trigger_ev && eventfd_signal_count()) { + trigger_ev = false; + req->work.func = io_poll_trigger_evfd; + } else { + req->flags |= REQ_F_COMP_LOCKED; + io_put_req(req); + req = NULL; + } + spin_unlock_irqrestore(&ctx->completion_lock, flags); + __io_cqring_ev_posted(ctx, trigger_ev); } else { req->result = mask; req->llist_node.next = NULL; -- cgit v1.2.3-59-g8ed1b From 0b7b21e42ba2d6ac9595a4358a9354249605a3af Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 31 Jan 2020 08:34:59 -0700 Subject: io_uring: use the proper helpers for io_send/recv MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Don't use the recvmsg/sendmsg helpers, use the same helpers that the recv(2) and send(2) system calls use. 
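Concretely, that means driving the socket through sock_sendmsg()/sock_recvmsg(), with the flags carried in msg_flags and the user buffer wrapped in an iov_iter. A hedged sketch of the send side (illustrative helper name, not the patched io_send() itself):

#include <linux/kernel.h>
#include <linux/net.h>
#include <linux/socket.h>
#include <linux/uio.h>

static int my_sock_send(struct socket *sock, void __user *buf, size_t len,
			unsigned int flags)
{
	struct msghdr msg = { .msg_flags = flags };	/* MSG_DONTWAIT etc. go here */
	struct iovec iov;
	int ret;

	/* single user buffer -> msg.msg_iter, the same way send(2) builds it */
	ret = import_single_range(WRITE, buf, len, &iov, &msg.msg_iter);
	if (unlikely(ret))
		return ret;

	return sock_sendmsg(sock, &msg);
}
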
Reported-by: 李通洲 Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 43f3b7d90299..cd07df2afe61 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3090,7 +3090,8 @@ static int io_send(struct io_kiocb *req, struct io_kiocb **nxt, else if (force_nonblock) flags |= MSG_DONTWAIT; - ret = __sys_sendmsg_sock(sock, &msg, flags); + msg.msg_flags = flags; + ret = sock_sendmsg(sock, &msg); if (force_nonblock && ret == -EAGAIN) return -EAGAIN; if (ret == -ERESTARTSYS) @@ -3116,6 +3117,7 @@ static int io_recvmsg_prep(struct io_kiocb *req, sr->msg_flags = READ_ONCE(sqe->msg_flags); sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); + sr->len = READ_ONCE(sqe->len); if (!io || req->opcode == IORING_OP_RECV) return 0; @@ -3234,7 +3236,7 @@ static int io_recv(struct io_kiocb *req, struct io_kiocb **nxt, else if (force_nonblock) flags |= MSG_DONTWAIT; - ret = __sys_recvmsg_sock(sock, &msg, NULL, NULL, flags); + ret = sock_recvmsg(sock, &msg, flags); if (force_nonblock && ret == -EAGAIN) return -EAGAIN; if (ret == -ERESTARTSYS) -- cgit v1.2.3-59-g8ed1b From 5d204bcfa09330972ad3428a8f81c23f371d3e6d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 31 Jan 2020 12:06:52 -0700 Subject: io_uring: don't map read/write iovec potentially twice If we have a read/write that is deferred, we already setup the async IO context for that request, and mapped it. When we later try and execute the request and we get -EAGAIN, we don't want to attempt to re-map it. If we do, we end up with garbage in the iovec, which typically leads to an -EFAULT or -EINVAL completion. Cc: stable@vger.kernel.org # 5.5 Reported-by: Dan Melnic Signed-off-by: Jens Axboe --- fs/io_uring.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index cd07df2afe61..678a1b245e10 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2171,10 +2171,12 @@ static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size, { if (!io_op_defs[req->opcode].async_ctx) return 0; - if (!req->io && io_alloc_async_ctx(req)) - return -ENOMEM; + if (!req->io) { + if (io_alloc_async_ctx(req)) + return -ENOMEM; - io_req_map_rw(req, io_size, iovec, fast_iov, iter); + io_req_map_rw(req, io_size, iovec, fast_iov, iter); + } req->work.func = io_rw_async; return 0; } -- cgit v1.2.3-59-g8ed1b From 9250f9ee194dc3dcee28a42a1533fa2cc0edd215 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 1 Feb 2020 01:22:08 +0300 Subject: io_uring: remove extra ->file check It won't ever get into io_prep_rw() when req->file haven't been set in io_req_set_file(), hence remove the check. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 678a1b245e10..a3c75aff8d10 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1862,9 +1862,6 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, unsigned ioprio; int ret; - if (!req->file) - return -EBADF; - if (S_ISREG(file_inode(req->file)->i_mode)) req->flags |= REQ_F_ISREG; -- cgit v1.2.3-59-g8ed1b From 1a417f4e618e05fba29ba222f1e8555c302376ce Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 31 Jan 2020 17:16:48 -0700 Subject: io_uring: fix sporadic double CQE entry for close We punt close to async for the final fput(), but we log the completion even before that even in that case. 
We rely on the request not having a files table assigned to detect what the final async close should do. However, if we punt the async queue to __io_queue_sqe(), we'll get ->files assigned and this makes io_close_finish() think it should both close the filp again (which does no harm) AND log a new CQE event for this request. This causes duplicate CQEs. Queue the request up for async manually so we don't grab files needlessly and trigger this condition. Signed-off-by: Jens Axboe --- fs/io_uring.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index a3c75aff8d10..b9c9e04cc1cb 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2843,16 +2843,13 @@ static void io_close_finish(struct io_wq_work **workptr) int ret; ret = filp_close(req->close.put_file, req->work.files); - if (ret < 0) { + if (ret < 0) req_set_fail_links(req); - } io_cqring_add_event(req, ret); } fput(req->close.put_file); - /* we bypassed the re-issue, drop the submission reference */ - io_put_req(req); io_put_req_find_next(req, &nxt); if (nxt) io_wq_assign_next(workptr, nxt); @@ -2894,7 +2891,13 @@ static int io_close(struct io_kiocb *req, struct io_kiocb **nxt, eagain: req->work.func = io_close_finish; - return -EAGAIN; + /* + * Do manual async queue here to avoid grabbing files - we don't + * need the files, and it'll cause io_close_finish() to close + * the file again and cause a double CQE entry for this request + */ + io_queue_async_work(req); + return 0; } static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe) -- cgit v1.2.3-59-g8ed1b From 3e69426da2599677ebbe76e2d97a606c4797bd74 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 1 Feb 2020 09:22:49 -0700 Subject: io_uring: punt even fadvise() WILLNEED to async context Andres correctly points out that read-ahead can block, if it needs to read in meta data (or even just through the page cache page allocations). Play it safe for now and just ensure WILLNEED is also punted to async context. While in there, allow the file settings hints from non-blocking context. They don't need to start/do IO, and we can safely do them inline. Fixes: 4840e418c2fc ("io_uring: add IORING_OP_FADVISE") Reported-by: Andres Freund Signed-off-by: Jens Axboe --- fs/io_uring.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index b9c9e04cc1cb..1580f1e7ba1c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2730,9 +2730,16 @@ static int io_fadvise(struct io_kiocb *req, struct io_kiocb **nxt, struct io_fadvise *fa = &req->fadvise; int ret; - /* DONTNEED may block, others _should_ not */ - if (fa->advice == POSIX_FADV_DONTNEED && force_nonblock) - return -EAGAIN; + if (force_nonblock) { + switch (fa->advice) { + case POSIX_FADV_NORMAL: + case POSIX_FADV_RANDOM: + case POSIX_FADV_SEQUENTIAL: + break; + default: + return -EAGAIN; + } + } ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice); if (ret < 0) -- cgit v1.2.3-59-g8ed1b From 6c8a31346925cbb373f84a18428ab3df59d3950e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 1 Feb 2020 03:58:00 +0300 Subject: io_uring: iterate req cache backwards Grab requests from cache-array from the end, so can get by only free_reqs. 
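In other words, with a bulk-allocated array it is enough to pop from the tail: a single free_reqs counter then serves as both the remaining count and the index of the next request, so cur_req can go away. A minimal sketch of the idea (illustrative struct and names, not the io_submit_state code itself):

#include <linux/slab.h>

#define MY_BATCH 8	/* small per-submit batch, as in fs/io_uring.c */

struct my_req_cache {
	void *reqs[MY_BATCH];
	unsigned int free_reqs;	/* count of cached entries == index one past the last */
};

static void *my_req_get(struct my_req_cache *c)
{
	if (!c->free_reqs)
		return NULL;	/* caller refills via kmem_cache_alloc_bulk() */
	c->free_reqs--;
	return c->reqs[c->free_reqs];	/* take from the end */
}

static void my_req_cache_drop(struct my_req_cache *c, struct kmem_cache *cachep)
{
	/* unused entries are exactly reqs[0..free_reqs-1]; free them in bulk */
	if (c->free_reqs)
		kmem_cache_free_bulk(cachep, c->free_reqs, c->reqs);
}
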
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 1580f1e7ba1c..42b45c57c3f8 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -585,8 +585,7 @@ struct io_submit_state { * io_kiocb alloc cache */ void *reqs[IO_IOPOLL_BATCH]; - unsigned int free_reqs; - unsigned int cur_req; + unsigned int free_reqs; /* * File reference cache @@ -1190,12 +1189,10 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx, ret = 1; } state->free_reqs = ret - 1; - state->cur_req = 1; - req = state->reqs[0]; + req = state->reqs[ret - 1]; } else { - req = state->reqs[state->cur_req]; state->free_reqs--; - state->cur_req++; + req = state->reqs[state->free_reqs]; } got_it: @@ -4849,8 +4846,7 @@ static void io_submit_state_end(struct io_submit_state *state) blk_finish_plug(&state->plug); io_file_put(state); if (state->free_reqs) - kmem_cache_free_bulk(req_cachep, state->free_reqs, - &state->reqs[state->cur_req]); + kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs); } /* -- cgit v1.2.3-59-g8ed1b From 3e577dcd73a1fdc641bf45e5ea4a37869de221b5 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 1 Feb 2020 03:58:42 +0300 Subject: io_uring: put the flag changing code in the same spot Both iocb_flags() and kiocb_set_rw_flags() are inline and modify kiocb->ki_flags. Place them close, so they can be potentially better optimised. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 42b45c57c3f8..edb00ae2619b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1867,8 +1867,11 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, req->flags |= REQ_F_CUR_POS; kiocb->ki_pos = req->file->f_pos; } - kiocb->ki_flags = iocb_flags(kiocb->ki_filp); kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp)); + kiocb->ki_flags = iocb_flags(kiocb->ki_filp); + ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags)); + if (unlikely(ret)) + return ret; ioprio = READ_ONCE(sqe->ioprio); if (ioprio) { @@ -1880,10 +1883,6 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, } else kiocb->ki_ioprio = get_current_ioprio(); - ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags)); - if (unlikely(ret)) - return ret; - /* don't allow async punt if RWF_NOWAIT was requested */ if ((kiocb->ki_flags & IOCB_NOWAIT) || (req->file->f_flags & O_NONBLOCK)) -- cgit v1.2.3-59-g8ed1b From 01d7a356872eec22ef34a33a5f9cfa917d145468 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 3 Feb 2020 10:33:42 -0700 Subject: aio: prevent potential eventfd recursion on poll If we have nested or circular eventfd wakeups, then we can deadlock if we run them inline from our poll waitqueue wakeup handler. It's also possible to have very long chains of notifications, to the extent where we could risk blowing the stack. Check the eventfd recursion count before calling eventfd_signal(). If it's non-zero, then punt the signaling to async context. This is always safe, as it takes us out-of-line in terms of stack and locking context. 
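Both this patch and the io_uring change above lean on the counter that eventfd_signal() now maintains; stripped of the eventfd specifics, the guard is just a per-CPU depth count. A minimal sketch with hypothetical my_* names:

#include <linux/kernel.h>
#include <linux/bug.h>
#include <linux/percpu.h>
#include <linux/wait.h>
#include <linux/poll.h>

static DEFINE_PER_CPU(int, my_wake_depth);

struct my_thing {
	wait_queue_head_t wait;
};

static void my_do_wakeup(struct my_thing *t)
{
	/* a wakeup handler re-entered us on this CPU: refuse to recurse */
	if (WARN_ON_ONCE(this_cpu_read(my_wake_depth)))
		return;

	this_cpu_inc(my_wake_depth);
	wake_up_poll(&t->wait, EPOLLIN);	/* may run arbitrary nested wakeup handlers */
	this_cpu_dec(my_wake_depth);
}

/* a helper like eventfd_signal_count() lets callers test for nesting */
static bool my_wake_count(void)
{
	return this_cpu_read(my_wake_depth);
}
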
Cc: stable@vger.kernel.org # 4.19+ Reviewed-by: Jeff Moyer Signed-off-by: Jens Axboe --- fs/aio.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/fs/aio.c b/fs/aio.c index a9fbad2ce5e6..5f3d3d814928 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1610,6 +1610,14 @@ static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb, return 0; } +static void aio_poll_put_work(struct work_struct *work) +{ + struct poll_iocb *req = container_of(work, struct poll_iocb, work); + struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll); + + iocb_put(iocb); +} + static void aio_poll_complete_work(struct work_struct *work) { struct poll_iocb *req = container_of(work, struct poll_iocb, work); @@ -1674,6 +1682,8 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, list_del_init(&req->wait.entry); if (mask && spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) { + struct kioctx *ctx = iocb->ki_ctx; + /* * Try to complete the iocb inline if we can. Use * irqsave/irqrestore because not all filesystems (e.g. fuse) @@ -1683,8 +1693,14 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, list_del(&iocb->ki_list); iocb->ki_res.res = mangle_poll(mask); req->done = true; - spin_unlock_irqrestore(&iocb->ki_ctx->ctx_lock, flags); - iocb_put(iocb); + if (iocb->ki_eventfd && eventfd_signal_count()) { + iocb = NULL; + INIT_WORK(&req->work, aio_poll_put_work); + schedule_work(&req->work); + } + spin_unlock_irqrestore(&ctx->ctx_lock, flags); + if (iocb) + iocb_put(iocb); } else { schedule_work(&req->work); } -- cgit v1.2.3-59-g8ed1b From df069d80c8e38c19531c392322e9a16617475c44 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 4 Feb 2020 16:48:34 -0700 Subject: io_uring: spin for sq thread to idle on shutdown As part of io_uring shutdown, we cancel work that is pending and won't necessarily complete on its own. That includes requests like poll commands and timeouts. If we're using SQPOLL for kernel side submission and we shutdown the ring immediately after queueing such work, we can race with the sqthread doing the submission. This means we may miss cancelling some work, which results in the io_uring shutdown hanging forever. Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- fs/io_uring.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index edb00ae2619b..87f8655656b5 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5070,7 +5070,8 @@ static int io_sq_thread(void *data) * reap events and wake us up. */ if (inflight || - (!time_after(jiffies, timeout) && ret != -EBUSY)) { + (!time_after(jiffies, timeout) && ret != -EBUSY && + !percpu_ref_is_dying(&ctx->refs))) { cond_resched(); continue; } @@ -6324,6 +6325,16 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) percpu_ref_kill(&ctx->refs); mutex_unlock(&ctx->uring_lock); + /* + * Wait for sq thread to idle, if we have one. It won't spin on new + * work after we've killed the ctx ref above. This is important to do + * before we cancel existing commands, as the thread could otherwise + * be queueing new work post that. If that's work we need to cancel, + * it could cause shutdown to hang. 
+ */ + while (ctx->sqo_thread && !wq_has_sleeper(&ctx->sqo_wait)) + cpu_relax(); + io_kill_timeouts(ctx); io_poll_remove_all(ctx); -- cgit v1.2.3-59-g8ed1b From 2faf852d1be8a4960d328492298da6448cca0279 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 4 Feb 2020 19:54:55 -0700 Subject: io_uring: cleanup fixed file data table references syzbot reports a use-after-free in io_ring_file_ref_switch() when it tries to switch back to percpu mode. When we put the final reference to the table by calling percpu_ref_kill_and_confirm(), we don't want the zero reference to queue async work for flushing the potentially queued up items. We currently do a few flush_work(), but they merely paper around the issue, since the work item may not have been queued yet depending on the when the percpu-ref callback gets run. Coming into the file unregister, we know we have the ring quiesced. io_ring_file_ref_switch() can check for whether or not the ref is dying or not, and not queue anything async at that point. Once the ref has been confirmed killed, flush any potential items manually. Reported-by: syzbot+7caeaea49c2c8a591e3d@syzkaller.appspotmail.com Fixes: 05f3fb3c5397 ("io_uring: avoid ring quiesce for fixed file set unregister and update") Signed-off-by: Jens Axboe --- fs/io_uring.c | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 87f8655656b5..deff11e84094 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -753,6 +753,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, struct io_uring_files_update *ip, unsigned nr_args); static int io_grab_files(struct io_kiocb *req); +static void io_ring_file_ref_flush(struct fixed_file_data *data); static struct kmem_cache *req_cachep; @@ -5261,15 +5262,10 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx) if (!data) return -ENXIO; - /* protect against inflight atomic switch, which drops the ref */ - percpu_ref_get(&data->refs); - /* wait for existing switches */ - flush_work(&data->ref_work); percpu_ref_kill_and_confirm(&data->refs, io_file_ref_kill); - wait_for_completion(&data->done); - percpu_ref_put(&data->refs); - /* flush potential new switch */ flush_work(&data->ref_work); + wait_for_completion(&data->done); + io_ring_file_ref_flush(data); percpu_ref_exit(&data->refs); __io_sqe_files_unregister(ctx); @@ -5507,14 +5503,11 @@ struct io_file_put { struct completion *done; }; -static void io_ring_file_ref_switch(struct work_struct *work) +static void io_ring_file_ref_flush(struct fixed_file_data *data) { struct io_file_put *pfile, *tmp; - struct fixed_file_data *data; struct llist_node *node; - data = container_of(work, struct fixed_file_data, ref_work); - while ((node = llist_del_all(&data->put_llist)) != NULL) { llist_for_each_entry_safe(pfile, tmp, node, llist) { io_ring_file_put(data->ctx, pfile->file); @@ -5524,7 +5517,14 @@ static void io_ring_file_ref_switch(struct work_struct *work) kfree(pfile); } } +} + +static void io_ring_file_ref_switch(struct work_struct *work) +{ + struct fixed_file_data *data; + data = container_of(work, struct fixed_file_data, ref_work); + io_ring_file_ref_flush(data); percpu_ref_get(&data->refs); percpu_ref_switch_to_percpu(&data->refs); } @@ -5535,8 +5535,14 @@ static void io_file_data_ref_zero(struct percpu_ref *ref) data = container_of(ref, struct fixed_file_data, refs); - /* we can't safely switch from inside this context, punt to wq */ - queue_work(system_wq, &data->ref_work); + /* + * We can't safely switch from 
inside this context, punt to wq. If + * the table ref is going away, the table is being unregistered. + * Don't queue up the async work for that case, the caller will + * handle it. + */ + if (!percpu_ref_is_dying(&data->refs)) + queue_work(system_wq, &data->ref_work); } static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, -- cgit v1.2.3-59-g8ed1b
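Read together, the unregister path after this last patch boils down to the following ordering. A condensed sketch as it would read inside fs/io_uring.c, using the names from the diff above (simplified; freeing of the data structure itself omitted):

static int io_sqe_files_unregister_sketch(struct io_ring_ctx *ctx)
{
	struct fixed_file_data *data = ctx->file_data;

	if (!data)
		return -ENXIO;

	/* ref-zero callback sees a dying ref and will not queue async work */
	percpu_ref_kill_and_confirm(&data->refs, io_file_ref_kill);
	flush_work(&data->ref_work);		/* drain a switch queued before the kill */
	wait_for_completion(&data->done);	/* kill confirmed */
	io_ring_file_ref_flush(data);		/* reap puts that raced with the kill */
	percpu_ref_exit(&data->refs);

	__io_sqe_files_unregister(ctx);
	return 0;
}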