aboutsummaryrefslogtreecommitdiffstats
path: root/include
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2019-11-25 10:40:27 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2019-11-25 10:40:27 -0800
commitfb4b3d3fd0c7f53168b6f6d07d1d97f55c676eeb (patch)
treecc059d8f729fd93013e44b31a9e3d0f781d51d9a /include
parentMerge tag 'tpmdd-next-20191112' of git://git.infradead.org/users/jjs/linux-tpmdd (diff)
parentio_uring: make POLL_ADD/POLL_REMOVE scale better (diff)
downloadlinux-dev-fb4b3d3fd0c7f53168b6f6d07d1d97f55c676eeb.tar.xz
linux-dev-fb4b3d3fd0c7f53168b6f6d07d1d97f55c676eeb.zip
Merge tag 'for-5.5/io_uring-20191121' of git://git.kernel.dk/linux-block
Pull io_uring updates from Jens Axboe: "A lot of stuff has been going on this cycle, with improving the support for networked IO (and hence unbounded request completion times) being one of the major themes. There's been a set of fixes done this week, I'll send those out as well once we're certain we're fully happy with them. This contains: - Unification of the "normal" submit path and the SQPOLL path (Pavel) - Support for sparse (and bigger) file sets, and updating of those file sets without needing to unregister/register again. - Independently sized CQ ring, instead of just making it always 2x the SQ ring size. This makes it more flexible for networked applications. - Support for overflowed CQ ring, never dropping events but providing backpressure on submits. - Add support for absolute timeouts, not just relative ones. - Support for generic cancellations. This divorces io_uring from workqueues as well, which additionally gets us one step closer to generic async system call support. - With cancellations, we can support grabbing the process file table as well, just like we do mm context. This allows support for system calls that create file descriptors, like accept4() support that's built on top of that. - Support for io_uring tracing (Dmitrii) - Support for linked timeouts. These abort an operation if it isn't completed by the time noted in the linke timeout. - Speedup tracking of poll requests - Various cleanups making the coder easier to follow (Jackie, Pavel, Bob, YueHaibing, me) - Update MAINTAINERS with new io_uring list" * tag 'for-5.5/io_uring-20191121' of git://git.kernel.dk/linux-block: (64 commits) io_uring: make POLL_ADD/POLL_REMOVE scale better io-wq: remove now redundant struct io_wq_nulls_list io_uring: Fix getting file for non-fd opcodes io_uring: introduce req_need_defer() io_uring: clean up io_uring_cancel_files() io-wq: ensure free/busy list browsing see all items io-wq: ensure we have a stable view of ->cur_work for cancellations io_wq: add get/put_work handlers to io_wq_create() io_uring: check for validity of ->rings in teardown io_uring: fix potential deadlock in io_poll_wake() io_uring: use correct "is IO worker" helper io_uring: fix -ENOENT issue with linked timer with short timeout io_uring: don't do flush cancel under inflight_lock io_uring: flag SQPOLL busy condition to userspace io_uring: make ASYNC_CANCEL work with poll and timeout io_uring: provide fallback request for OOM situations io_uring: convert accept4() -ERESTARTSYS into -EINTR io_uring: fix error clear of ->file_table in io_sqe_files_register() io_uring: separate the io_free_req and io_free_req_find_next interface io_uring: keep io_put_req only responsible for release and put req ...
Diffstat (limited to 'include')
-rw-r--r--include/Kbuild1
-rw-r--r--include/linux/sched.h1
-rw-r--r--include/linux/socket.h3
-rw-r--r--include/trace/events/io_uring.h358
-rw-r--r--include/uapi/linux/io_uring.h24
5 files changed, 386 insertions, 1 deletions
diff --git a/include/Kbuild b/include/Kbuild
index 6f9ec5a64ec5..95508049ee51 100644
--- a/include/Kbuild
+++ b/include/Kbuild
@@ -1027,6 +1027,7 @@ header-test- += trace/events/fsi_master_gpio.h
header-test- += trace/events/huge_memory.h
header-test- += trace/events/ib_mad.h
header-test- += trace/events/ib_umad.h
+header-test- += trace/events/io_uring.h
header-test- += trace/events/iscsi.h
header-test- += trace/events/jbd2.h
header-test- += trace/events/kvm.h
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 67a1d86981a9..6666e25606b7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1468,6 +1468,7 @@ extern struct pid *cad_pid;
#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */
#define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */
#define PF_MEMALLOC_NOCMA 0x10000000 /* All allocation request will have _GFP_MOVABLE cleared */
+#define PF_IO_WORKER 0x20000000 /* Task is an IO worker */
#define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */
#define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */
diff --git a/include/linux/socket.h b/include/linux/socket.h
index 4049d9755cf1..09c32a21555b 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -392,6 +392,9 @@ extern int __sys_recvfrom(int fd, void __user *ubuf, size_t size,
extern int __sys_sendto(int fd, void __user *buff, size_t len,
unsigned int flags, struct sockaddr __user *addr,
int addr_len);
+extern int __sys_accept4_file(struct file *file, unsigned file_flags,
+ struct sockaddr __user *upeer_sockaddr,
+ int __user *upeer_addrlen, int flags);
extern int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr,
int __user *upeer_addrlen, int flags);
extern int __sys_socket(int family, int type, int protocol);
diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h
new file mode 100644
index 000000000000..72a4d0174b02
--- /dev/null
+++ b/include/trace/events/io_uring.h
@@ -0,0 +1,358 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM io_uring
+
+#if !defined(_TRACE_IO_URING_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_IO_URING_H
+
+#include <linux/tracepoint.h>
+
+struct io_wq_work;
+
+/**
+ * io_uring_create - called after a new io_uring context was prepared
+ *
+ * @fd: corresponding file descriptor
+ * @ctx: pointer to a ring context structure
+ * @sq_entries: actual SQ size
+ * @cq_entries: actual CQ size
+ * @flags: SQ ring flags, provided to io_uring_setup(2)
+ *
+ * Allows to trace io_uring creation and provide pointer to a context, that can
+ * be used later to find correlated events.
+ */
+TRACE_EVENT(io_uring_create,
+
+ TP_PROTO(int fd, void *ctx, u32 sq_entries, u32 cq_entries, u32 flags),
+
+ TP_ARGS(fd, ctx, sq_entries, cq_entries, flags),
+
+ TP_STRUCT__entry (
+ __field( int, fd )
+ __field( void *, ctx )
+ __field( u32, sq_entries )
+ __field( u32, cq_entries )
+ __field( u32, flags )
+ ),
+
+ TP_fast_assign(
+ __entry->fd = fd;
+ __entry->ctx = ctx;
+ __entry->sq_entries = sq_entries;
+ __entry->cq_entries = cq_entries;
+ __entry->flags = flags;
+ ),
+
+ TP_printk("ring %p, fd %d sq size %d, cq size %d, flags %d",
+ __entry->ctx, __entry->fd, __entry->sq_entries,
+ __entry->cq_entries, __entry->flags)
+);
+
+/**
+ * io_uring_register - called after a buffer/file/eventfd was succesfully
+ * registered for a ring
+ *
+ * @ctx: pointer to a ring context structure
+ * @opcode: describes which operation to perform
+ * @nr_user_files: number of registered files
+ * @nr_user_bufs: number of registered buffers
+ * @cq_ev_fd: whether eventfs registered or not
+ * @ret: return code
+ *
+ * Allows to trace fixed files/buffers/eventfds, that could be registered to
+ * avoid an overhead of getting references to them for every operation. This
+ * event, together with io_uring_file_get, can provide a full picture of how
+ * much overhead one can reduce via fixing.
+ */
+TRACE_EVENT(io_uring_register,
+
+ TP_PROTO(void *ctx, unsigned opcode, unsigned nr_files,
+ unsigned nr_bufs, bool eventfd, long ret),
+
+ TP_ARGS(ctx, opcode, nr_files, nr_bufs, eventfd, ret),
+
+ TP_STRUCT__entry (
+ __field( void *, ctx )
+ __field( unsigned, opcode )
+ __field( unsigned, nr_files )
+ __field( unsigned, nr_bufs )
+ __field( bool, eventfd )
+ __field( long, ret )
+ ),
+
+ TP_fast_assign(
+ __entry->ctx = ctx;
+ __entry->opcode = opcode;
+ __entry->nr_files = nr_files;
+ __entry->nr_bufs = nr_bufs;
+ __entry->eventfd = eventfd;
+ __entry->ret = ret;
+ ),
+
+ TP_printk("ring %p, opcode %d, nr_user_files %d, nr_user_bufs %d, "
+ "eventfd %d, ret %ld",
+ __entry->ctx, __entry->opcode, __entry->nr_files,
+ __entry->nr_bufs, __entry->eventfd, __entry->ret)
+);
+
+/**
+ * io_uring_file_get - called before getting references to an SQE file
+ *
+ * @ctx: pointer to a ring context structure
+ * @fd: SQE file descriptor
+ *
+ * Allows to trace out how often an SQE file reference is obtained, which can
+ * help figuring out if it makes sense to use fixed files, or check that fixed
+ * files are used correctly.
+ */
+TRACE_EVENT(io_uring_file_get,
+
+ TP_PROTO(void *ctx, int fd),
+
+ TP_ARGS(ctx, fd),
+
+ TP_STRUCT__entry (
+ __field( void *, ctx )
+ __field( int, fd )
+ ),
+
+ TP_fast_assign(
+ __entry->ctx = ctx;
+ __entry->fd = fd;
+ ),
+
+ TP_printk("ring %p, fd %d", __entry->ctx, __entry->fd)
+);
+
+/**
+ * io_uring_queue_async_work - called before submitting a new async work
+ *
+ * @ctx: pointer to a ring context structure
+ * @hashed: type of workqueue, hashed or normal
+ * @req: pointer to a submitted request
+ * @work: pointer to a submitted io_wq_work
+ *
+ * Allows to trace asynchronous work submission.
+ */
+TRACE_EVENT(io_uring_queue_async_work,
+
+ TP_PROTO(void *ctx, int rw, void * req, struct io_wq_work *work,
+ unsigned int flags),
+
+ TP_ARGS(ctx, rw, req, work, flags),
+
+ TP_STRUCT__entry (
+ __field( void *, ctx )
+ __field( int, rw )
+ __field( void *, req )
+ __field( struct io_wq_work *, work )
+ __field( unsigned int, flags )
+ ),
+
+ TP_fast_assign(
+ __entry->ctx = ctx;
+ __entry->rw = rw;
+ __entry->req = req;
+ __entry->work = work;
+ __entry->flags = flags;
+ ),
+
+ TP_printk("ring %p, request %p, flags %d, %s queue, work %p",
+ __entry->ctx, __entry->req, __entry->flags,
+ __entry->rw ? "hashed" : "normal", __entry->work)
+);
+
+/**
+ * io_uring_defer_list - called before the io_uring work added into defer_list
+ *
+ * @ctx: pointer to a ring context structure
+ * @req: pointer to a deferred request
+ * @shadow: whether request is shadow or not
+ *
+ * Allows to track deferred requests, to get an insight about what requests are
+ * not started immediately.
+ */
+TRACE_EVENT(io_uring_defer,
+
+ TP_PROTO(void *ctx, void *req, bool shadow),
+
+ TP_ARGS(ctx, req, shadow),
+
+ TP_STRUCT__entry (
+ __field( void *, ctx )
+ __field( void *, req )
+ __field( bool, shadow )
+ ),
+
+ TP_fast_assign(
+ __entry->ctx = ctx;
+ __entry->req = req;
+ __entry->shadow = shadow;
+ ),
+
+ TP_printk("ring %p, request %p%s", __entry->ctx, __entry->req,
+ __entry->shadow ? ", shadow": "")
+);
+
+/**
+ * io_uring_link - called before the io_uring request added into link_list of
+ * another request
+ *
+ * @ctx: pointer to a ring context structure
+ * @req: pointer to a linked request
+ * @target_req: pointer to a previous request, that would contain @req
+ *
+ * Allows to track linked requests, to understand dependencies between requests
+ * and how does it influence their execution flow.
+ */
+TRACE_EVENT(io_uring_link,
+
+ TP_PROTO(void *ctx, void *req, void *target_req),
+
+ TP_ARGS(ctx, req, target_req),
+
+ TP_STRUCT__entry (
+ __field( void *, ctx )
+ __field( void *, req )
+ __field( void *, target_req )
+ ),
+
+ TP_fast_assign(
+ __entry->ctx = ctx;
+ __entry->req = req;
+ __entry->target_req = target_req;
+ ),
+
+ TP_printk("ring %p, request %p linked after %p",
+ __entry->ctx, __entry->req, __entry->target_req)
+);
+
+/**
+ * io_uring_cqring_wait - called before start waiting for an available CQE
+ *
+ * @ctx: pointer to a ring context structure
+ * @min_events: minimal number of events to wait for
+ *
+ * Allows to track waiting for CQE, so that we can e.g. troubleshoot
+ * situations, when an application wants to wait for an event, that never
+ * comes.
+ */
+TRACE_EVENT(io_uring_cqring_wait,
+
+ TP_PROTO(void *ctx, int min_events),
+
+ TP_ARGS(ctx, min_events),
+
+ TP_STRUCT__entry (
+ __field( void *, ctx )
+ __field( int, min_events )
+ ),
+
+ TP_fast_assign(
+ __entry->ctx = ctx;
+ __entry->min_events = min_events;
+ ),
+
+ TP_printk("ring %p, min_events %d", __entry->ctx, __entry->min_events)
+);
+
+/**
+ * io_uring_fail_link - called before failing a linked request
+ *
+ * @req: request, which links were cancelled
+ * @link: cancelled link
+ *
+ * Allows to track linked requests cancellation, to see not only that some work
+ * was cancelled, but also which request was the reason.
+ */
+TRACE_EVENT(io_uring_fail_link,
+
+ TP_PROTO(void *req, void *link),
+
+ TP_ARGS(req, link),
+
+ TP_STRUCT__entry (
+ __field( void *, req )
+ __field( void *, link )
+ ),
+
+ TP_fast_assign(
+ __entry->req = req;
+ __entry->link = link;
+ ),
+
+ TP_printk("request %p, link %p", __entry->req, __entry->link)
+);
+
+/**
+ * io_uring_complete - called when completing an SQE
+ *
+ * @ctx: pointer to a ring context structure
+ * @user_data: user data associated with the request
+ * @res: result of the request
+ *
+ */
+TRACE_EVENT(io_uring_complete,
+
+ TP_PROTO(void *ctx, u64 user_data, long res),
+
+ TP_ARGS(ctx, user_data, res),
+
+ TP_STRUCT__entry (
+ __field( void *, ctx )
+ __field( u64, user_data )
+ __field( long, res )
+ ),
+
+ TP_fast_assign(
+ __entry->ctx = ctx;
+ __entry->user_data = user_data;
+ __entry->res = res;
+ ),
+
+ TP_printk("ring %p, user_data 0x%llx, result %ld",
+ __entry->ctx, (unsigned long long)__entry->user_data,
+ __entry->res)
+);
+
+
+/**
+ * io_uring_submit_sqe - called before submitting one SQE
+ *
+ * @ctx: pointer to a ring context structure
+ * @user_data: user data associated with the request
+ * @force_nonblock: whether a context blocking or not
+ * @sq_thread: true if sq_thread has submitted this SQE
+ *
+ * Allows to track SQE submitting, to understand what was the source of it, SQ
+ * thread or io_uring_enter call.
+ */
+TRACE_EVENT(io_uring_submit_sqe,
+
+ TP_PROTO(void *ctx, u64 user_data, bool force_nonblock, bool sq_thread),
+
+ TP_ARGS(ctx, user_data, force_nonblock, sq_thread),
+
+ TP_STRUCT__entry (
+ __field( void *, ctx )
+ __field( u64, user_data )
+ __field( bool, force_nonblock )
+ __field( bool, sq_thread )
+ ),
+
+ TP_fast_assign(
+ __entry->ctx = ctx;
+ __entry->user_data = user_data;
+ __entry->force_nonblock = force_nonblock;
+ __entry->sq_thread = sq_thread;
+ ),
+
+ TP_printk("ring %p, user data 0x%llx, non block %d, sq_thread %d",
+ __entry->ctx, (unsigned long long) __entry->user_data,
+ __entry->force_nonblock, __entry->sq_thread)
+);
+
+#endif /* _TRACE_IO_URING_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index ea57526a5b89..2a1569211d87 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -19,7 +19,10 @@ struct io_uring_sqe {
__u8 flags; /* IOSQE_ flags */
__u16 ioprio; /* ioprio for the request */
__s32 fd; /* file descriptor to do IO on */
- __u64 off; /* offset into file */
+ union {
+ __u64 off; /* offset into file */
+ __u64 addr2;
+ };
__u64 addr; /* pointer to buffer or iovecs */
__u32 len; /* buffer size or number of iovecs */
union {
@@ -29,6 +32,8 @@ struct io_uring_sqe {
__u32 sync_range_flags;
__u32 msg_flags;
__u32 timeout_flags;
+ __u32 accept_flags;
+ __u32 cancel_flags;
};
__u64 user_data; /* data to be passed back at completion time */
union {
@@ -50,6 +55,7 @@ struct io_uring_sqe {
#define IORING_SETUP_IOPOLL (1U << 0) /* io_context is polled */
#define IORING_SETUP_SQPOLL (1U << 1) /* SQ poll thread */
#define IORING_SETUP_SQ_AFF (1U << 2) /* sq_thread_cpu is valid */
+#define IORING_SETUP_CQSIZE (1U << 3) /* app defines CQ size */
#define IORING_OP_NOP 0
#define IORING_OP_READV 1
@@ -63,6 +69,10 @@ struct io_uring_sqe {
#define IORING_OP_SENDMSG 9
#define IORING_OP_RECVMSG 10
#define IORING_OP_TIMEOUT 11
+#define IORING_OP_TIMEOUT_REMOVE 12
+#define IORING_OP_ACCEPT 13
+#define IORING_OP_ASYNC_CANCEL 14
+#define IORING_OP_LINK_TIMEOUT 15
/*
* sqe->fsync_flags
@@ -70,6 +80,11 @@ struct io_uring_sqe {
#define IORING_FSYNC_DATASYNC (1U << 0)
/*
+ * sqe->timeout_flags
+ */
+#define IORING_TIMEOUT_ABS (1U << 0)
+
+/*
* IO completion data structure (Completion Queue Entry)
*/
struct io_uring_cqe {
@@ -140,6 +155,7 @@ struct io_uring_params {
* io_uring_params->features flags
*/
#define IORING_FEAT_SINGLE_MMAP (1U << 0)
+#define IORING_FEAT_NODROP (1U << 1)
/*
* io_uring_register(2) opcodes and arguments
@@ -150,5 +166,11 @@ struct io_uring_params {
#define IORING_UNREGISTER_FILES 3
#define IORING_REGISTER_EVENTFD 4
#define IORING_UNREGISTER_EVENTFD 5
+#define IORING_REGISTER_FILES_UPDATE 6
+
+struct io_uring_files_update {
+ __u32 offset;
+ __s32 *fds;
+};
#endif