aboutsummaryrefslogtreecommitdiffstats
path: root/fs/io_uring.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/io_uring.c')
-rw-r--r--fs/io_uring.c352
1 files changed, 286 insertions, 66 deletions
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 4ed4b110a154..e2a66e12fbc6 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -231,6 +231,7 @@ struct io_ring_ctx {
struct task_struct *sqo_thread; /* if using sq thread polling */
struct mm_struct *sqo_mm;
wait_queue_head_t sqo_wait;
+ struct completion sqo_thread_started;
struct {
/* CQ ring */
@@ -322,6 +323,7 @@ struct io_kiocb {
struct io_ring_ctx *ctx;
struct list_head list;
+ struct list_head link_list;
unsigned int flags;
refcount_t refs;
#define REQ_F_NOWAIT 1 /* must not punt to workers */
@@ -330,8 +332,10 @@ struct io_kiocb {
#define REQ_F_SEQ_PREV 8 /* sequential with previous */
#define REQ_F_IO_DRAIN 16 /* drain existing IO first */
#define REQ_F_IO_DRAINED 32 /* drain done */
+#define REQ_F_LINK 64 /* linked sqes */
+#define REQ_F_FAIL_LINK 128 /* fail rest of links */
u64 user_data;
- u32 error; /* iopoll result from callback */
+ u32 result;
u32 sequence;
struct work_struct work;
@@ -395,7 +399,8 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
if (!ctx)
return NULL;
- if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, 0, GFP_KERNEL)) {
+ if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
+ PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
kfree(ctx);
return NULL;
}
@@ -403,6 +408,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
ctx->flags = p->flags;
init_waitqueue_head(&ctx->cq_wait);
init_completion(&ctx->ctx_done);
+ init_completion(&ctx->sqo_thread_started);
mutex_init(&ctx->uring_lock);
init_waitqueue_head(&ctx->wait);
for (i = 0; i < ARRAY_SIZE(ctx->pending_async); i++) {
@@ -584,6 +590,7 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
req->flags = 0;
/* one is dropped after submission, the other at completion */
refcount_set(&req->refs, 2);
+ req->result = 0;
return req;
out:
io_ring_drop_ctx_refs(ctx, 1);
@@ -599,7 +606,7 @@ static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr)
}
}
-static void io_free_req(struct io_kiocb *req)
+static void __io_free_req(struct io_kiocb *req)
{
if (req->file && !(req->flags & REQ_F_FIXED_FILE))
fput(req->file);
@@ -607,6 +614,63 @@ static void io_free_req(struct io_kiocb *req)
kmem_cache_free(req_cachep, req);
}
+static void io_req_link_next(struct io_kiocb *req)
+{
+ struct io_kiocb *nxt;
+
+ /*
+ * The list should never be empty when we are called here. But could
+ * potentially happen if the chain is messed up, check to be on the
+ * safe side.
+ */
+ nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, list);
+ if (nxt) {
+ list_del(&nxt->list);
+ if (!list_empty(&req->link_list)) {
+ INIT_LIST_HEAD(&nxt->link_list);
+ list_splice(&req->link_list, &nxt->link_list);
+ nxt->flags |= REQ_F_LINK;
+ }
+
+ INIT_WORK(&nxt->work, io_sq_wq_submit_work);
+ queue_work(req->ctx->sqo_wq, &nxt->work);
+ }
+}
+
+/*
+ * Called if REQ_F_LINK is set, and we fail the head request
+ */
+static void io_fail_links(struct io_kiocb *req)
+{
+ struct io_kiocb *link;
+
+ while (!list_empty(&req->link_list)) {
+ link = list_first_entry(&req->link_list, struct io_kiocb, list);
+ list_del(&link->list);
+
+ io_cqring_add_event(req->ctx, link->user_data, -ECANCELED);
+ __io_free_req(link);
+ }
+}
+
+static void io_free_req(struct io_kiocb *req)
+{
+ /*
+ * If LINK is set, we have dependent requests in this chain. If we
+ * didn't fail this request, queue the first one up, moving any other
+ * dependencies to the next request. In case of failure, fail the rest
+ * of the chain.
+ */
+ if (req->flags & REQ_F_LINK) {
+ if (req->flags & REQ_F_FAIL_LINK)
+ io_fail_links(req);
+ else
+ io_req_link_next(req);
+ }
+
+ __io_free_req(req);
+}
+
static void io_put_req(struct io_kiocb *req)
{
if (refcount_dec_and_test(&req->refs))
@@ -628,16 +692,17 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
req = list_first_entry(done, struct io_kiocb, list);
list_del(&req->list);
- io_cqring_fill_event(ctx, req->user_data, req->error);
+ io_cqring_fill_event(ctx, req->user_data, req->result);
(*nr_events)++;
if (refcount_dec_and_test(&req->refs)) {
/* If we're not using fixed files, we have to pair the
* completion part with the file put. Use regular
* completions for those, only batch free for fixed
- * file.
+ * file and non-linked commands.
*/
- if (req->flags & REQ_F_FIXED_FILE) {
+ if ((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) ==
+ REQ_F_FIXED_FILE) {
reqs[to_free++] = req;
if (to_free == ARRAY_SIZE(reqs))
io_free_req_many(ctx, reqs, &to_free);
@@ -776,6 +841,8 @@ static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
kiocb_end_write(kiocb);
+ if ((req->flags & REQ_F_LINK) && res != req->result)
+ req->flags |= REQ_F_FAIL_LINK;
io_cqring_add_event(req->ctx, req->user_data, res);
io_put_req(req);
}
@@ -786,7 +853,9 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
kiocb_end_write(kiocb);
- req->error = res;
+ if ((req->flags & REQ_F_LINK) && res != req->result)
+ req->flags |= REQ_F_FAIL_LINK;
+ req->result = res;
if (res != -EAGAIN)
req->flags |= REQ_F_IOPOLL_COMPLETED;
}
@@ -929,7 +998,6 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
!kiocb->ki_filp->f_op->iopoll)
return -EOPNOTSUPP;
- req->error = 0;
kiocb->ki_flags |= IOCB_HIPRI;
kiocb->ki_complete = io_complete_rw_iopoll;
} else {
@@ -1001,9 +1069,9 @@ static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
return 0;
}
-static int io_import_iovec(struct io_ring_ctx *ctx, int rw,
- const struct sqe_submit *s, struct iovec **iovec,
- struct iov_iter *iter)
+static ssize_t io_import_iovec(struct io_ring_ctx *ctx, int rw,
+ const struct sqe_submit *s, struct iovec **iovec,
+ struct iov_iter *iter)
{
const struct io_uring_sqe *sqe = s->sqe;
void __user *buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
@@ -1021,7 +1089,7 @@ static int io_import_iovec(struct io_ring_ctx *ctx, int rw,
opcode = READ_ONCE(sqe->opcode);
if (opcode == IORING_OP_READ_FIXED ||
opcode == IORING_OP_WRITE_FIXED) {
- int ret = io_import_fixed(ctx, rw, sqe, iter);
+ ssize_t ret = io_import_fixed(ctx, rw, sqe, iter);
*iovec = NULL;
return ret;
}
@@ -1087,7 +1155,7 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
struct iov_iter iter;
struct file *file;
size_t iov_count;
- int ret;
+ ssize_t read_size, ret;
ret = io_prep_rw(req, s, force_nonblock);
if (ret)
@@ -1100,16 +1168,30 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
return -EINVAL;
ret = io_import_iovec(req->ctx, READ, s, &iovec, &iter);
- if (ret)
+ if (ret < 0)
return ret;
+ read_size = ret;
+ if (req->flags & REQ_F_LINK)
+ req->result = read_size;
+
iov_count = iov_iter_count(&iter);
ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_count);
if (!ret) {
ssize_t ret2;
- /* Catch -EAGAIN return for forced non-blocking submission */
ret2 = call_read_iter(file, kiocb, &iter);
+ /*
+ * In case of a short read, punt to async. This can happen
+ * if we have data partially cached. Alternatively we can
+ * return the short read, in which case the application will
+ * need to issue another SQE and wait for it. That SQE will
+ * need async punt anyway, so it's more efficient to do it
+ * here.
+ */
+ if (force_nonblock && ret2 > 0 && ret2 < read_size)
+ ret2 = -EAGAIN;
+ /* Catch -EAGAIN return for forced non-blocking submission */
if (!force_nonblock || ret2 != -EAGAIN) {
io_rw_done(kiocb, ret2);
} else {
@@ -1134,7 +1216,7 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
struct iov_iter iter;
struct file *file;
size_t iov_count;
- int ret;
+ ssize_t ret;
ret = io_prep_rw(req, s, force_nonblock);
if (ret)
@@ -1147,9 +1229,12 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
return -EINVAL;
ret = io_import_iovec(req->ctx, WRITE, s, &iovec, &iter);
- if (ret)
+ if (ret < 0)
return ret;
+ if (req->flags & REQ_F_LINK)
+ req->result = ret;
+
iov_count = iov_iter_count(&iter);
ret = -EAGAIN;
@@ -1253,6 +1338,8 @@ static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
end > 0 ? end : LLONG_MAX,
fsync_flags & IORING_FSYNC_DATASYNC);
+ if (ret < 0 && (req->flags & REQ_F_LINK))
+ req->flags |= REQ_F_FAIL_LINK;
io_cqring_add_event(req->ctx, sqe->user_data, ret);
io_put_req(req);
return 0;
@@ -1297,11 +1384,70 @@ static int io_sync_file_range(struct io_kiocb *req,
ret = sync_file_range(req->rw.ki_filp, sqe_off, sqe_len, flags);
+ if (ret < 0 && (req->flags & REQ_F_LINK))
+ req->flags |= REQ_F_FAIL_LINK;
io_cqring_add_event(req->ctx, sqe->user_data, ret);
io_put_req(req);
return 0;
}
+#if defined(CONFIG_NET)
+static int io_send_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+ bool force_nonblock,
+ long (*fn)(struct socket *, struct user_msghdr __user *,
+ unsigned int))
+{
+ struct socket *sock;
+ int ret;
+
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ return -EINVAL;
+
+ sock = sock_from_file(req->file, &ret);
+ if (sock) {
+ struct user_msghdr __user *msg;
+ unsigned flags;
+
+ flags = READ_ONCE(sqe->msg_flags);
+ if (flags & MSG_DONTWAIT)
+ req->flags |= REQ_F_NOWAIT;
+ else if (force_nonblock)
+ flags |= MSG_DONTWAIT;
+
+ msg = (struct user_msghdr __user *) (unsigned long)
+ READ_ONCE(sqe->addr);
+
+ ret = fn(sock, msg, flags);
+ if (force_nonblock && ret == -EAGAIN)
+ return ret;
+ }
+
+ io_cqring_add_event(req->ctx, sqe->user_data, ret);
+ io_put_req(req);
+ return 0;
+}
+#endif
+
+static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+ bool force_nonblock)
+{
+#if defined(CONFIG_NET)
+ return io_send_recvmsg(req, sqe, force_nonblock, __sys_sendmsg_sock);
+#else
+ return -EOPNOTSUPP;
+#endif
+}
+
+static int io_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+ bool force_nonblock)
+{
+#if defined(CONFIG_NET)
+ return io_send_recvmsg(req, sqe, force_nonblock, __sys_recvmsg_sock);
+#else
+ return -EOPNOTSUPP;
+#endif
+}
+
static void io_poll_remove_one(struct io_kiocb *req)
{
struct io_poll_iocb *poll = &req->poll;
@@ -1549,9 +1695,10 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
{
int ret, opcode;
+ req->user_data = READ_ONCE(s->sqe->user_data);
+
if (unlikely(s->index >= ctx->sq_entries))
return -EINVAL;
- req->user_data = READ_ONCE(s->sqe->user_data);
opcode = READ_ONCE(s->sqe->opcode);
switch (opcode) {
@@ -1586,6 +1733,12 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
case IORING_OP_SYNC_FILE_RANGE:
ret = io_sync_file_range(req, s->sqe, force_nonblock);
break;
+ case IORING_OP_SENDMSG:
+ ret = io_sendmsg(req, s->sqe, force_nonblock);
+ break;
+ case IORING_OP_RECVMSG:
+ ret = io_recvmsg(req, s->sqe, force_nonblock);
+ break;
default:
ret = -EINVAL;
break;
@@ -1595,7 +1748,7 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
return ret;
if (ctx->flags & IORING_SETUP_IOPOLL) {
- if (req->error == -EAGAIN)
+ if (req->result == -EAGAIN)
return -EAGAIN;
/* workqueue context doesn't hold uring_lock, grab it now */
@@ -1819,31 +1972,11 @@ static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s,
return 0;
}
-static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
- struct io_submit_state *state)
+static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
+ struct sqe_submit *s)
{
- struct io_kiocb *req;
int ret;
- /* enforce forwards compatibility on users */
- if (unlikely(s->sqe->flags & ~(IOSQE_FIXED_FILE | IOSQE_IO_DRAIN)))
- return -EINVAL;
-
- req = io_get_req(ctx, state);
- if (unlikely(!req))
- return -EAGAIN;
-
- ret = io_req_set_file(ctx, s, state, req);
- if (unlikely(ret))
- goto out;
-
- ret = io_req_defer(ctx, req, s->sqe);
- if (ret) {
- if (ret == -EIOCBQUEUED)
- ret = 0;
- return ret;
- }
-
ret = __io_submit_sqe(ctx, req, s, true);
if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
struct io_uring_sqe *sqe_copy;
@@ -1866,24 +1999,93 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
/*
* Queued up for async execution, worker will release
- * submit reference when the iocb is actually
- * submitted.
+ * submit reference when the iocb is actually submitted.
*/
return 0;
}
}
-out:
/* drop submission reference */
io_put_req(req);
/* and drop final reference, if we failed */
- if (ret)
+ if (ret) {
+ io_cqring_add_event(ctx, req->user_data, ret);
+ if (req->flags & REQ_F_LINK)
+ req->flags |= REQ_F_FAIL_LINK;
io_put_req(req);
+ }
return ret;
}
+#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK)
+
+static void io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
+ struct io_submit_state *state, struct io_kiocb **link)
+{
+ struct io_uring_sqe *sqe_copy;
+ struct io_kiocb *req;
+ int ret;
+
+ /* enforce forwards compatibility on users */
+ if (unlikely(s->sqe->flags & ~SQE_VALID_FLAGS)) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ req = io_get_req(ctx, state);
+ if (unlikely(!req)) {
+ ret = -EAGAIN;
+ goto err;
+ }
+
+ ret = io_req_set_file(ctx, s, state, req);
+ if (unlikely(ret)) {
+err_req:
+ io_free_req(req);
+err:
+ io_cqring_add_event(ctx, s->sqe->user_data, ret);
+ return;
+ }
+
+ ret = io_req_defer(ctx, req, s->sqe);
+ if (ret) {
+ if (ret != -EIOCBQUEUED)
+ goto err_req;
+ return;
+ }
+
+ /*
+ * If we already have a head request, queue this one for async
+ * submittal once the head completes. If we don't have a head but
+ * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
+ * submitted sync once the chain is complete. If none of those
+ * conditions are true (normal request), then just queue it.
+ */
+ if (*link) {
+ struct io_kiocb *prev = *link;
+
+ sqe_copy = kmemdup(s->sqe, sizeof(*sqe_copy), GFP_KERNEL);
+ if (!sqe_copy) {
+ ret = -EAGAIN;
+ goto err_req;
+ }
+
+ s->sqe = sqe_copy;
+ memcpy(&req->submit, s, sizeof(*s));
+ list_add_tail(&req->list, &prev->link_list);
+ } else if (s->sqe->flags & IOSQE_IO_LINK) {
+ req->flags |= REQ_F_LINK;
+
+ memcpy(&req->submit, s, sizeof(*s));
+ INIT_LIST_HEAD(&req->link_list);
+ *link = req;
+ } else {
+ io_queue_sqe(ctx, req, s);
+ }
+}
+
/*
* Batched submission is done, ensure local IO is flushed out.
*/
@@ -1966,7 +2168,9 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes,
unsigned int nr, bool has_user, bool mm_fault)
{
struct io_submit_state state, *statep = NULL;
- int ret, i, submitted = 0;
+ struct io_kiocb *link = NULL;
+ bool prev_was_link = false;
+ int i, submitted = 0;
if (nr > IO_PLUG_THRESHOLD) {
io_submit_state_start(&state, ctx, nr);
@@ -1974,22 +2178,30 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes,
}
for (i = 0; i < nr; i++) {
+ /*
+ * If previous wasn't linked and we have a linked command,
+ * that's the end of the chain. Submit the previous link.
+ */
+ if (!prev_was_link && link) {
+ io_queue_sqe(ctx, link, &link->submit);
+ link = NULL;
+ }
+ prev_was_link = (sqes[i].sqe->flags & IOSQE_IO_LINK) != 0;
+
if (unlikely(mm_fault)) {
- ret = -EFAULT;
+ io_cqring_add_event(ctx, sqes[i].sqe->user_data,
+ -EFAULT);
} else {
sqes[i].has_user = has_user;
sqes[i].needs_lock = true;
sqes[i].needs_fixed_file = true;
- ret = io_submit_sqe(ctx, &sqes[i], statep);
- }
- if (!ret) {
+ io_submit_sqe(ctx, &sqes[i], statep, &link);
submitted++;
- continue;
}
-
- io_cqring_add_event(ctx, sqes[i].sqe->user_data, ret);
}
+ if (link)
+ io_queue_sqe(ctx, link, &link->submit);
if (statep)
io_submit_state_end(&state);
@@ -2006,6 +2218,8 @@ static int io_sq_thread(void *data)
unsigned inflight;
unsigned long timeout;
+ complete(&ctx->sqo_thread_started);
+
old_fs = get_fs();
set_fs(USER_DS);
@@ -2130,6 +2344,8 @@ static int io_sq_thread(void *data)
static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
{
struct io_submit_state state, *statep = NULL;
+ struct io_kiocb *link = NULL;
+ bool prev_was_link = false;
int i, submit = 0;
if (to_submit > IO_PLUG_THRESHOLD) {
@@ -2139,22 +2355,30 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
for (i = 0; i < to_submit; i++) {
struct sqe_submit s;
- int ret;
if (!io_get_sqring(ctx, &s))
break;
+ /*
+ * If previous wasn't linked and we have a linked command,
+ * that's the end of the chain. Submit the previous link.
+ */
+ if (!prev_was_link && link) {
+ io_queue_sqe(ctx, link, &link->submit);
+ link = NULL;
+ }
+ prev_was_link = (s.sqe->flags & IOSQE_IO_LINK) != 0;
+
s.has_user = true;
s.needs_lock = false;
s.needs_fixed_file = false;
submit++;
-
- ret = io_submit_sqe(ctx, &s, statep);
- if (ret)
- io_cqring_add_event(ctx, s.sqe->user_data, ret);
+ io_submit_sqe(ctx, &s, statep, &link);
}
io_commit_sqring(ctx);
+ if (link)
+ io_queue_sqe(ctx, link, &link->submit);
if (statep)
io_submit_state_end(statep);
@@ -2176,7 +2400,6 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
const sigset_t __user *sig, size_t sigsz)
{
struct io_cq_ring *ring = ctx->cq_ring;
- sigset_t ksigmask, sigsaved;
int ret;
if (io_cqring_events(ring) >= min_events)
@@ -2186,21 +2409,17 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
#ifdef CONFIG_COMPAT
if (in_compat_syscall())
ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
- &ksigmask, &sigsaved, sigsz);
+ sigsz);
else
#endif
- ret = set_user_sigmask(sig, &ksigmask,
- &sigsaved, sigsz);
+ ret = set_user_sigmask(sig, sigsz);
if (ret)
return ret;
}
ret = wait_event_interruptible(ctx->wait, io_cqring_events(ring) >= min_events);
-
- if (sig)
- restore_user_sigmask(sig, &sigsaved, ret == -ERESTARTSYS);
-
+ restore_saved_sigmask_unless(ret == -ERESTARTSYS);
if (ret == -ERESTARTSYS)
ret = -EINTR;
@@ -2240,6 +2459,7 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
static void io_sq_thread_stop(struct io_ring_ctx *ctx)
{
if (ctx->sqo_thread) {
+ wait_for_completion(&ctx->sqo_thread_started);
/*
* The park is a bit of a work-around, without it we get
* warning spews on shutdown with SQPOLL set and affinity