author    Linus Torvalds <torvalds@linux-foundation.org>  2022-08-02 13:20:44 -0700
committer Linus Torvalds <torvalds@linux-foundation.org>  2022-08-02 13:20:44 -0700
commit    b349b1181d24af1c151134a3c39725e94a5619dd (patch)
tree      7347cc4035de947c22e575ac7c649c0fa8658dd1 /include
parent    Merge branch 'turbostat' of git://git.kernel.org/pub/scm/linux/kernel/git/lenb/linux (diff)
parent    io_uring: ensure REQ_F_ISREG is set async offload (diff)
Merge tag 'for-5.20/io_uring-2022-07-29' of git://git.kernel.dk/linux-block
Pull io_uring updates from Jens Axboe:

 - As per (valid) complaint in the last merge window, fs/io_uring.c has
   grown quite large these days. io_uring isn't really tied to fs
   either, as it supports a wide variety of functionality outside of
   that.

   Move the code to io_uring/ and split it into files that implement a
   specific request type, and split some code into helpers as well. The
   code is organized a lot better like this, and io_uring.c is now
   < 4K LOC (me).

 - Deprecate the epoll_ctl opcode. It'll still work, just trigger a
   warning once if used. If we don't get any complaints on this, and I
   don't expect any, then we can fully remove it in a future release
   (me).

 - Improve the cancel hash locking (Hao)

 - kbuf cleanups (Hao)

 - Efficiency improvements to the task_work handling (Dylan, Pavel)

 - Provided buffer improvements (Dylan)

 - Add support for recv/recvmsg multishot. This is similar to the
   accept (or poll) support we have for multishot, where a single SQE
   can trigger every time data is received. For applications that
   expect to do more than a few receives on an instantiated socket,
   this greatly improves efficiency (Dylan).

 - Efficiency improvements for poll handling (Pavel)

 - Poll cancelation improvements (Pavel)

 - Allow specifying a range for direct descriptor allocations (Pavel)

 - Cleanup the cqe32 handling (Pavel)

 - Move io_uring types to greatly cleanup the tracing (Pavel)

 - Tons of great code cleanups and improvements (Pavel)

 - Add a way to do sync cancelations rather than through the sqe -> cqe
   interface, as that's a lot easier to use for some use cases (me).

 - Add support to IORING_OP_MSG_RING for sending direct descriptors to
   a different ring. This avoids the usually problematic SCM case, as
   we disallow those. (me)

 - Make the per-command alloc cache we use for apoll generic, place
   limits on it, and use it for netmsg as well (me).

 - Various cleanups (me, Michal, Gustavo, Uros)

* tag 'for-5.20/io_uring-2022-07-29' of git://git.kernel.dk/linux-block: (172 commits)
  io_uring: ensure REQ_F_ISREG is set async offload
  net: fix compat pointer in get_compat_msghdr()
  io_uring: Don't require reinitable percpu_ref
  io_uring: fix types in io_recvmsg_multishot_overflow
  io_uring: Use atomic_long_try_cmpxchg in __io_account_mem
  io_uring: support multishot in recvmsg
  net: copy from user before calling __get_compat_msghdr
  net: copy from user before calling __copy_msghdr
  io_uring: support 0 length iov in buffer select in compat
  io_uring: fix multishot ending when not polled
  io_uring: add netmsg cache
  io_uring: impose max limit on apoll cache
  io_uring: add abstraction around apoll cache
  io_uring: move apoll cache to poll.c
  io_uring: consolidate hash_locked io-wq handling
  io_uring: clear REQ_F_HASH_LOCKED on hash removal
  io_uring: don't race double poll setting REQ_F_ASYNC_DATA
  io_uring: don't miss setting REQ_F_DOUBLE_POLL
  io_uring: disable multishot recvmsg
  io_uring: only trace one of complete or overflow
  ...
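For the multishot receive support called out above, here is a minimal sketch of how an application could request it through the raw SQE interface, using the IORING_RECV_MULTISHOT flag this series adds to the uapi header below. The helper is illustrative only and assumes the ring and a provided-buffer group have already been set up by the caller:

#include <string.h>
#include <linux/io_uring.h>

/*
 * Illustrative sketch: arm a multishot recv on an existing socket.
 * Assumes a provided-buffer group 'buf_group' has been registered and
 * that 'sqe' comes from the application's own submission-queue handling
 * (e.g. liburing's io_uring_get_sqe()).
 */
static void prep_recv_multishot(struct io_uring_sqe *sqe, int sockfd,
                                __u16 buf_group, __u64 user_data)
{
        memset(sqe, 0, sizeof(*sqe));
        sqe->opcode = IORING_OP_RECV;
        sqe->fd = sockfd;
        /* recv/recvmsg flags are carried in sqe->ioprio */
        sqe->ioprio = IORING_RECV_MULTISHOT;
        /* multishot recv selects a provided buffer for every completion */
        sqe->flags = IOSQE_BUFFER_SELECT;
        sqe->buf_group = buf_group;
        sqe->user_data = user_data;
        /*
         * Each received chunk posts a CQE; IORING_CQE_F_MORE stays set
         * in cqe->flags for as long as the request remains armed.
         */
}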
Diffstat (limited to 'include')
-rw-r--r--  include/linux/io_uring_types.h    544
-rw-r--r--  include/linux/socket.h              7
-rw-r--r--  include/net/compat.h                5
-rw-r--r--  include/trace/events/io_uring.h   174
-rw-r--r--  include/uapi/linux/io_uring.h      69
5 files changed, 704 insertions, 95 deletions
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
new file mode 100644
index 000000000000..d54b8b7e0746
--- /dev/null
+++ b/include/linux/io_uring_types.h
@@ -0,0 +1,544 @@
+#ifndef IO_URING_TYPES_H
+#define IO_URING_TYPES_H
+
+#include <linux/blkdev.h>
+#include <linux/task_work.h>
+#include <linux/bitmap.h>
+#include <uapi/linux/io_uring.h>
+
+struct io_wq_work_node {
+ struct io_wq_work_node *next;
+};
+
+struct io_wq_work_list {
+ struct io_wq_work_node *first;
+ struct io_wq_work_node *last;
+};
+
+struct io_wq_work {
+ struct io_wq_work_node list;
+ unsigned flags;
+ /* place it here instead of io_kiocb as it fills padding and saves 4B */
+ int cancel_seq;
+};
+
+struct io_fixed_file {
+ /* file * with additional FFS_* flags */
+ unsigned long file_ptr;
+};
+
+struct io_file_table {
+ struct io_fixed_file *files;
+ unsigned long *bitmap;
+ unsigned int alloc_hint;
+};
+
+struct io_hash_bucket {
+ spinlock_t lock;
+ struct hlist_head list;
+} ____cacheline_aligned_in_smp;
+
+struct io_hash_table {
+ struct io_hash_bucket *hbs;
+ unsigned hash_bits;
+};
+
+struct io_uring {
+ u32 head ____cacheline_aligned_in_smp;
+ u32 tail ____cacheline_aligned_in_smp;
+};
+
+/*
+ * This data is shared with the application through the mmap at offsets
+ * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
+ *
+ * The offsets to the member fields are published through struct
+ * io_sqring_offsets when calling io_uring_setup.
+ */
+struct io_rings {
+ /*
+ * Head and tail offsets into the ring; the offsets need to be
+ * masked to get valid indices.
+ *
+ * The kernel controls head of the sq ring and the tail of the cq ring,
+ * and the application controls tail of the sq ring and the head of the
+ * cq ring.
+ */
+ struct io_uring sq, cq;
+ /*
+ * Bitmasks to apply to head and tail offsets (constant, equals
+ * ring_entries - 1)
+ */
+ u32 sq_ring_mask, cq_ring_mask;
+ /* Ring sizes (constant, power of 2) */
+ u32 sq_ring_entries, cq_ring_entries;
+ /*
+ * Number of invalid entries dropped by the kernel due to
+ * invalid index stored in array
+ *
+ * Written by the kernel, shouldn't be modified by the
+ * application (i.e. get number of "new events" by comparing to
+ * cached value).
+ *
+ * After a new SQ head value was read by the application this
+ * counter includes all submissions that were dropped reaching
+ * the new SQ head (and possibly more).
+ */
+ u32 sq_dropped;
+ /*
+ * Runtime SQ flags
+ *
+ * Written by the kernel, shouldn't be modified by the
+ * application.
+ *
+ * The application needs a full memory barrier before checking
+ * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
+ */
+ atomic_t sq_flags;
+ /*
+ * Runtime CQ flags
+ *
+ * Written by the application, shouldn't be modified by the
+ * kernel.
+ */
+ u32 cq_flags;
+ /*
+ * Number of completion events lost because the queue was full;
+ * this should be avoided by the application by making sure
+ * there are not more requests pending than there is space in
+ * the completion queue.
+ *
+ * Written by the kernel, shouldn't be modified by the
+ * application (i.e. get number of "new events" by comparing to
+ * cached value).
+ *
+ * As completion events come in out of order this counter is not
+ * ordered with any other data.
+ */
+ u32 cq_overflow;
+ /*
+ * Ring buffer of completion events.
+ *
+ * The kernel writes completion events fresh every time they are
+ * produced, so the application is allowed to modify pending
+ * entries.
+ */
+ struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp;
+};
+
+struct io_restriction {
+ DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
+ DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
+ u8 sqe_flags_allowed;
+ u8 sqe_flags_required;
+ bool registered;
+};
+
+struct io_submit_link {
+ struct io_kiocb *head;
+ struct io_kiocb *last;
+};
+
+struct io_submit_state {
+ /* inline/task_work completion list, under ->uring_lock */
+ struct io_wq_work_node free_list;
+ /* batch completion logic */
+ struct io_wq_work_list compl_reqs;
+ struct io_submit_link link;
+
+ bool plug_started;
+ bool need_plug;
+ unsigned short submit_nr;
+ struct blk_plug plug;
+};
+
+struct io_ev_fd {
+ struct eventfd_ctx *cq_ev_fd;
+ unsigned int eventfd_async: 1;
+ struct rcu_head rcu;
+};
+
+struct io_alloc_cache {
+ struct hlist_head list;
+ unsigned int nr_cached;
+};
+
+struct io_ring_ctx {
+ /* const or read-mostly hot data */
+ struct {
+ struct percpu_ref refs;
+
+ struct io_rings *rings;
+ unsigned int flags;
+ enum task_work_notify_mode notify_method;
+ unsigned int compat: 1;
+ unsigned int drain_next: 1;
+ unsigned int restricted: 1;
+ unsigned int off_timeout_used: 1;
+ unsigned int drain_active: 1;
+ unsigned int drain_disabled: 1;
+ unsigned int has_evfd: 1;
+ unsigned int syscall_iopoll: 1;
+ } ____cacheline_aligned_in_smp;
+
+ /* submission data */
+ struct {
+ struct mutex uring_lock;
+
+ /*
+ * Ring buffer of indices into array of io_uring_sqe, which is
+ * mmapped by the application using the IORING_OFF_SQES offset.
+ *
+ * This indirection could e.g. be used to assign fixed
+ * io_uring_sqe entries to operations and only submit them to
+ * the queue when needed.
+ *
+ * The kernel modifies neither the indices array nor the entries
+ * array.
+ */
+ u32 *sq_array;
+ struct io_uring_sqe *sq_sqes;
+ unsigned cached_sq_head;
+ unsigned sq_entries;
+
+ /*
+ * Fixed resources fast path, should be accessed only under
+ * uring_lock, and updated through io_uring_register(2)
+ */
+ struct io_rsrc_node *rsrc_node;
+ int rsrc_cached_refs;
+ atomic_t cancel_seq;
+ struct io_file_table file_table;
+ unsigned nr_user_files;
+ unsigned nr_user_bufs;
+ struct io_mapped_ubuf **user_bufs;
+
+ struct io_submit_state submit_state;
+
+ struct io_buffer_list *io_bl;
+ struct xarray io_bl_xa;
+ struct list_head io_buffers_cache;
+
+ struct io_hash_table cancel_table_locked;
+ struct list_head cq_overflow_list;
+ struct io_alloc_cache apoll_cache;
+ struct io_alloc_cache netmsg_cache;
+ } ____cacheline_aligned_in_smp;
+
+ /* IRQ completion list, under ->completion_lock */
+ struct io_wq_work_list locked_free_list;
+ unsigned int locked_free_nr;
+
+ const struct cred *sq_creds; /* cred used for __io_sq_thread() */
+ struct io_sq_data *sq_data; /* if using sq thread polling */
+
+ struct wait_queue_head sqo_sq_wait;
+ struct list_head sqd_list;
+
+ unsigned long check_cq;
+
+ unsigned int file_alloc_start;
+ unsigned int file_alloc_end;
+
+ struct xarray personalities;
+ u32 pers_next;
+
+ struct {
+ /*
+ * We cache a range of free CQEs we can use, once exhausted it
+ * should go through a slower range setup, see __io_get_cqe()
+ */
+ struct io_uring_cqe *cqe_cached;
+ struct io_uring_cqe *cqe_sentinel;
+
+ unsigned cached_cq_tail;
+ unsigned cq_entries;
+ struct io_ev_fd __rcu *io_ev_fd;
+ struct wait_queue_head cq_wait;
+ unsigned cq_extra;
+ } ____cacheline_aligned_in_smp;
+
+ struct {
+ spinlock_t completion_lock;
+
+ /*
+ * ->iopoll_list is protected by the ctx->uring_lock for
+ * io_uring instances that don't use IORING_SETUP_SQPOLL.
+ * For SQPOLL, only the single threaded io_sq_thread() will
+ * manipulate the list, hence no extra locking is needed there.
+ */
+ struct io_wq_work_list iopoll_list;
+ struct io_hash_table cancel_table;
+ bool poll_multi_queue;
+
+ struct list_head io_buffers_comp;
+ } ____cacheline_aligned_in_smp;
+
+ /* timeouts */
+ struct {
+ spinlock_t timeout_lock;
+ atomic_t cq_timeouts;
+ struct list_head timeout_list;
+ struct list_head ltimeout_list;
+ unsigned cq_last_tm_flush;
+ } ____cacheline_aligned_in_smp;
+
+ /* Keep this last, we don't need it for the fast path */
+
+ struct io_restriction restrictions;
+ struct task_struct *submitter_task;
+
+ /* slow path rsrc auxiliary data, used by update/register */
+ struct io_rsrc_node *rsrc_backup_node;
+ struct io_mapped_ubuf *dummy_ubuf;
+ struct io_rsrc_data *file_data;
+ struct io_rsrc_data *buf_data;
+
+ struct delayed_work rsrc_put_work;
+ struct llist_head rsrc_put_llist;
+ struct list_head rsrc_ref_list;
+ spinlock_t rsrc_ref_lock;
+
+ struct list_head io_buffers_pages;
+
+ #if defined(CONFIG_UNIX)
+ struct socket *ring_sock;
+ #endif
+ /* hashed buffered write serialization */
+ struct io_wq_hash *hash_map;
+
+ /* Only used for accounting purposes */
+ struct user_struct *user;
+ struct mm_struct *mm_account;
+
+ /* ctx exit and cancelation */
+ struct llist_head fallback_llist;
+ struct delayed_work fallback_work;
+ struct work_struct exit_work;
+ struct list_head tctx_list;
+ struct completion ref_comp;
+
+ /* io-wq management, e.g. thread count */
+ u32 iowq_limits[2];
+ bool iowq_limits_set;
+
+ struct list_head defer_list;
+ unsigned sq_thread_idle;
+ /* protected by ->completion_lock */
+ unsigned evfd_last_cq_tail;
+};
+
+enum {
+ REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT,
+ REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT,
+ REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT,
+ REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT,
+ REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT,
+ REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
+ REQ_F_CQE_SKIP_BIT = IOSQE_CQE_SKIP_SUCCESS_BIT,
+
+ /* first byte is taken by user flags, shift it to not overlap */
+ REQ_F_FAIL_BIT = 8,
+ REQ_F_INFLIGHT_BIT,
+ REQ_F_CUR_POS_BIT,
+ REQ_F_NOWAIT_BIT,
+ REQ_F_LINK_TIMEOUT_BIT,
+ REQ_F_NEED_CLEANUP_BIT,
+ REQ_F_POLLED_BIT,
+ REQ_F_BUFFER_SELECTED_BIT,
+ REQ_F_BUFFER_RING_BIT,
+ REQ_F_REISSUE_BIT,
+ REQ_F_CREDS_BIT,
+ REQ_F_REFCOUNT_BIT,
+ REQ_F_ARM_LTIMEOUT_BIT,
+ REQ_F_ASYNC_DATA_BIT,
+ REQ_F_SKIP_LINK_CQES_BIT,
+ REQ_F_SINGLE_POLL_BIT,
+ REQ_F_DOUBLE_POLL_BIT,
+ REQ_F_PARTIAL_IO_BIT,
+ REQ_F_CQE32_INIT_BIT,
+ REQ_F_APOLL_MULTISHOT_BIT,
+ REQ_F_CLEAR_POLLIN_BIT,
+ REQ_F_HASH_LOCKED_BIT,
+ /* keep async read/write and isreg together and in order */
+ REQ_F_SUPPORT_NOWAIT_BIT,
+ REQ_F_ISREG_BIT,
+
+ /* not a real bit, just to check we're not overflowing the space */
+ __REQ_F_LAST_BIT,
+};
+
+enum {
+ /* ctx owns file */
+ REQ_F_FIXED_FILE = BIT(REQ_F_FIXED_FILE_BIT),
+ /* drain existing IO first */
+ REQ_F_IO_DRAIN = BIT(REQ_F_IO_DRAIN_BIT),
+ /* linked sqes */
+ REQ_F_LINK = BIT(REQ_F_LINK_BIT),
+ /* doesn't sever on completion < 0 */
+ REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT),
+ /* IOSQE_ASYNC */
+ REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT),
+ /* IOSQE_BUFFER_SELECT */
+ REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT),
+ /* IOSQE_CQE_SKIP_SUCCESS */
+ REQ_F_CQE_SKIP = BIT(REQ_F_CQE_SKIP_BIT),
+
+ /* fail rest of links */
+ REQ_F_FAIL = BIT(REQ_F_FAIL_BIT),
+ /* on inflight list, should be cancelled and waited on exit reliably */
+ REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT),
+ /* read/write uses file position */
+ REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT),
+ /* must not punt to workers */
+ REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT),
+ /* has or had linked timeout */
+ REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT),
+ /* needs cleanup */
+ REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT),
+ /* already went through poll handler */
+ REQ_F_POLLED = BIT(REQ_F_POLLED_BIT),
+ /* buffer already selected */
+ REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT),
+ /* buffer selected from ring, needs commit */
+ REQ_F_BUFFER_RING = BIT(REQ_F_BUFFER_RING_BIT),
+ /* caller should reissue async */
+ REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT),
+ /* supports async reads/writes */
+ REQ_F_SUPPORT_NOWAIT = BIT(REQ_F_SUPPORT_NOWAIT_BIT),
+ /* regular file */
+ REQ_F_ISREG = BIT(REQ_F_ISREG_BIT),
+ /* has creds assigned */
+ REQ_F_CREDS = BIT(REQ_F_CREDS_BIT),
+ /* skip refcounting if not set */
+ REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT),
+ /* there is a linked timeout that has to be armed */
+ REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT),
+ /* ->async_data allocated */
+ REQ_F_ASYNC_DATA = BIT(REQ_F_ASYNC_DATA_BIT),
+ /* don't post CQEs while failing linked requests */
+ REQ_F_SKIP_LINK_CQES = BIT(REQ_F_SKIP_LINK_CQES_BIT),
+ /* single poll may be active */
+ REQ_F_SINGLE_POLL = BIT(REQ_F_SINGLE_POLL_BIT),
+ /* double poll may be active */
+ REQ_F_DOUBLE_POLL = BIT(REQ_F_DOUBLE_POLL_BIT),
+ /* request has already done partial IO */
+ REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT),
+ /* fast poll multishot mode */
+ REQ_F_APOLL_MULTISHOT = BIT(REQ_F_APOLL_MULTISHOT_BIT),
+ /* ->extra1 and ->extra2 are initialised */
+ REQ_F_CQE32_INIT = BIT(REQ_F_CQE32_INIT_BIT),
+ /* recvmsg special flag, clear EPOLLIN */
+ REQ_F_CLEAR_POLLIN = BIT(REQ_F_CLEAR_POLLIN_BIT),
+ /* hashed into ->cancel_hash_locked, protected by ->uring_lock */
+ REQ_F_HASH_LOCKED = BIT(REQ_F_HASH_LOCKED_BIT),
+};
+
+typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked);
+
+struct io_task_work {
+ struct llist_node node;
+ io_req_tw_func_t func;
+};
+
+struct io_cqe {
+ __u64 user_data;
+ __s32 res;
+ /* fd initially, then cflags for completion */
+ union {
+ __u32 flags;
+ int fd;
+ };
+};
+
+/*
+ * Each request type overlays its private data structure on top of this one.
+ * They must not exceed this one in size.
+ */
+struct io_cmd_data {
+ struct file *file;
+ /* each command gets 56 bytes of data */
+ __u8 data[56];
+};
+
+#define io_kiocb_to_cmd(req) ((void *) &(req)->cmd)
+#define cmd_to_io_kiocb(ptr) ((struct io_kiocb *) ptr)
+
+struct io_kiocb {
+ union {
+ /*
+ * NOTE! Each of the io_kiocb union members has the file pointer
+ * as the first entry in their struct definition. So you can
+ * access the file pointer through any of the sub-structs,
+ * or directly as just 'file' in this struct.
+ */
+ struct file *file;
+ struct io_cmd_data cmd;
+ };
+
+ u8 opcode;
+ /* polled IO has completed */
+ u8 iopoll_completed;
+ /*
+ * Can be either a fixed buffer index, or used with provided buffers.
+ * For the latter, before issue it points to the buffer group ID,
+ * and after selection it points to the buffer ID itself.
+ */
+ u16 buf_index;
+ unsigned int flags;
+
+ struct io_cqe cqe;
+
+ struct io_ring_ctx *ctx;
+ struct task_struct *task;
+
+ struct io_rsrc_node *rsrc_node;
+
+ union {
+ /* store used ubuf, so we can prevent reloading */
+ struct io_mapped_ubuf *imu;
+
+ /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
+ struct io_buffer *kbuf;
+
+ /*
+ * stores buffer ID for ring provided buffers, valid IFF
+ * REQ_F_BUFFER_RING is set.
+ */
+ struct io_buffer_list *buf_list;
+ };
+
+ union {
+ /* used by request caches, completion batching and iopoll */
+ struct io_wq_work_node comp_list;
+ /* cache ->apoll->events */
+ __poll_t apoll_events;
+ };
+ atomic_t refs;
+ atomic_t poll_refs;
+ struct io_task_work io_task_work;
+ /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
+ union {
+ struct hlist_node hash_node;
+ struct {
+ u64 extra1;
+ u64 extra2;
+ };
+ };
+ /* internal polling, see IORING_FEAT_FAST_POLL */
+ struct async_poll *apoll;
+ /* opcode allocated if it needs to store data for async defer */
+ void *async_data;
+ /* linked requests, IFF REQ_F_HARDLINK or REQ_F_LINK are set */
+ struct io_kiocb *link;
+ /* custom credentials, valid IFF REQ_F_CREDS is set */
+ const struct cred *creds;
+ struct io_wq_work work;
+};
+
+struct io_overflow_cqe {
+ struct list_head list;
+ struct io_uring_cqe cqe;
+};
+
+#endif
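The io_cmd_data overlay and the io_kiocb_to_cmd()/cmd_to_io_kiocb() macros above are what the new per-opcode files under io_uring/ build on: each request type lays its private state over the 64-byte cmd area. A rough sketch of the pattern follows; the struct, field and function names are hypothetical, not taken from the tree:

#include <linux/io_uring_types.h>

/* hypothetical per-opcode state; must fit in struct io_cmd_data */
struct io_example_op {
        struct file     *file;  /* first member, mirrors io_kiocb->file */
        unsigned int    flags;
        u64             addr;
};

static int io_example_prep(struct io_kiocb *req,
                           const struct io_uring_sqe *sqe)
{
        struct io_example_op *ex = io_kiocb_to_cmd(req);

        /* the overlay must never outgrow the shared cmd area */
        BUILD_BUG_ON(sizeof(struct io_example_op) >
                     sizeof(struct io_cmd_data));

        ex->flags = READ_ONCE(sqe->rw_flags);
        ex->addr  = READ_ONCE(sqe->addr);
        return 0;
}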
diff --git a/include/linux/socket.h b/include/linux/socket.h
index 17311ad9f9af..be24f1c8568a 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -416,10 +416,9 @@ extern int recvmsg_copy_msghdr(struct msghdr *msg,
struct user_msghdr __user *umsg, unsigned flags,
struct sockaddr __user **uaddr,
struct iovec **iov);
-extern int __copy_msghdr_from_user(struct msghdr *kmsg,
- struct user_msghdr __user *umsg,
- struct sockaddr __user **save_addr,
- struct iovec __user **uiov, size_t *nsegs);
+extern int __copy_msghdr(struct msghdr *kmsg,
+ struct user_msghdr *umsg,
+ struct sockaddr __user **save_addr);
/* helpers which do the actual work for syscalls */
extern int __sys_recvfrom(int fd, void __user *ubuf, size_t size,
diff --git a/include/net/compat.h b/include/net/compat.h
index 595fee069b82..84c163f40f38 100644
--- a/include/net/compat.h
+++ b/include/net/compat.h
@@ -46,9 +46,8 @@ struct compat_rtentry {
unsigned short rt_irtt; /* Initial RTT */
};
-int __get_compat_msghdr(struct msghdr *kmsg, struct compat_msghdr __user *umsg,
- struct sockaddr __user **save_addr, compat_uptr_t *ptr,
- compat_size_t *len);
+int __get_compat_msghdr(struct msghdr *kmsg, struct compat_msghdr *msg,
+ struct sockaddr __user **save_addr);
int get_compat_msghdr(struct msghdr *, struct compat_msghdr __user *,
struct sockaddr __user **, struct iovec **);
int put_cmsg_compat(struct msghdr*, int, int, int, void *);
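The narrower __get_compat_msghdr() prototype above goes hand in hand with the "net: copy from user before calling __get_compat_msghdr" change in this series: the caller copies the compat_msghdr into kernel memory once and passes the kernel copy down, instead of the helper reading userspace itself. A simplified sketch of the resulting caller shape (not the literal in-tree code, which also sets up the iovec):

#include <linux/uaccess.h>
#include <net/compat.h>

static int example_get_compat_msghdr(struct msghdr *kmsg,
                                     struct compat_msghdr __user *umsg,
                                     struct sockaddr __user **save_addr)
{
        struct compat_msghdr msg;

        /* pull the whole header into kernel space first ... */
        if (copy_from_user(&msg, umsg, sizeof(msg)))
                return -EFAULT;

        /* ... then hand the kernel copy to the helper */
        return __get_compat_msghdr(kmsg, &msg, save_addr);
}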
diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h
index aa2f951b07cd..95a8cfaad15a 100644
--- a/include/trace/events/io_uring.h
+++ b/include/trace/events/io_uring.h
@@ -7,6 +7,7 @@
#include <linux/tracepoint.h>
#include <uapi/linux/io_uring.h>
+#include <linux/io_uring_types.h>
#include <linux/io_uring.h>
struct io_wq_work;
@@ -97,9 +98,7 @@ TRACE_EVENT(io_uring_register,
/**
* io_uring_file_get - called before getting references to an SQE file
*
- * @ctx: pointer to a ring context structure
* @req: pointer to a submitted request
- * @user_data: user data associated with the request
* @fd: SQE file descriptor
*
* Allows to trace out how often an SQE file reference is obtained, which can
@@ -108,9 +107,9 @@ TRACE_EVENT(io_uring_register,
*/
TRACE_EVENT(io_uring_file_get,
- TP_PROTO(void *ctx, void *req, unsigned long long user_data, int fd),
+ TP_PROTO(struct io_kiocb *req, int fd),
- TP_ARGS(ctx, req, user_data, fd),
+ TP_ARGS(req, fd),
TP_STRUCT__entry (
__field( void *, ctx )
@@ -120,9 +119,9 @@ TRACE_EVENT(io_uring_file_get,
),
TP_fast_assign(
- __entry->ctx = ctx;
+ __entry->ctx = req->ctx;
__entry->req = req;
- __entry->user_data = user_data;
+ __entry->user_data = req->cqe.user_data;
__entry->fd = fd;
),
@@ -133,22 +132,16 @@ TRACE_EVENT(io_uring_file_get,
/**
* io_uring_queue_async_work - called before submitting a new async work
*
- * @ctx: pointer to a ring context structure
* @req: pointer to a submitted request
- * @user_data: user data associated with the request
- * @opcode: opcode of request
- * @flags request flags
- * @work: pointer to a submitted io_wq_work
* @rw: type of workqueue, hashed or normal
*
* Allows to trace asynchronous work submission.
*/
TRACE_EVENT(io_uring_queue_async_work,
- TP_PROTO(void *ctx, void * req, unsigned long long user_data, u8 opcode,
- unsigned int flags, struct io_wq_work *work, int rw),
+ TP_PROTO(struct io_kiocb *req, int rw),
- TP_ARGS(ctx, req, user_data, opcode, flags, work, rw),
+ TP_ARGS(req, rw),
TP_STRUCT__entry (
__field( void *, ctx )
@@ -159,19 +152,19 @@ TRACE_EVENT(io_uring_queue_async_work,
__field( struct io_wq_work *, work )
__field( int, rw )
- __string( op_str, io_uring_get_opcode(opcode) )
+ __string( op_str, io_uring_get_opcode(req->opcode) )
),
TP_fast_assign(
- __entry->ctx = ctx;
+ __entry->ctx = req->ctx;
__entry->req = req;
- __entry->user_data = user_data;
- __entry->flags = flags;
- __entry->opcode = opcode;
- __entry->work = work;
+ __entry->user_data = req->cqe.user_data;
+ __entry->flags = req->flags;
+ __entry->opcode = req->opcode;
+ __entry->work = &req->work;
__entry->rw = rw;
- __assign_str(op_str, io_uring_get_opcode(opcode));
+ __assign_str(op_str, io_uring_get_opcode(req->opcode));
),
TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, flags 0x%x, %s queue, work %p",
@@ -183,19 +176,16 @@ TRACE_EVENT(io_uring_queue_async_work,
/**
* io_uring_defer - called when an io_uring request is deferred
*
- * @ctx: pointer to a ring context structure
* @req: pointer to a deferred request
- * @user_data: user data associated with the request
- * @opcode: opcode of request
*
* Allows to track deferred requests, to get an insight about what requests are
* not started immediately.
*/
TRACE_EVENT(io_uring_defer,
- TP_PROTO(void *ctx, void *req, unsigned long long user_data, u8 opcode),
+ TP_PROTO(struct io_kiocb *req),
- TP_ARGS(ctx, req, user_data, opcode),
+ TP_ARGS(req),
TP_STRUCT__entry (
__field( void *, ctx )
@@ -203,16 +193,16 @@ TRACE_EVENT(io_uring_defer,
__field( unsigned long long, data )
__field( u8, opcode )
- __string( op_str, io_uring_get_opcode(opcode) )
+ __string( op_str, io_uring_get_opcode(req->opcode) )
),
TP_fast_assign(
- __entry->ctx = ctx;
+ __entry->ctx = req->ctx;
__entry->req = req;
- __entry->data = user_data;
- __entry->opcode = opcode;
+ __entry->data = req->cqe.user_data;
+ __entry->opcode = req->opcode;
- __assign_str(op_str, io_uring_get_opcode(opcode));
+ __assign_str(op_str, io_uring_get_opcode(req->opcode));
),
TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s",
@@ -224,7 +214,6 @@ TRACE_EVENT(io_uring_defer,
* io_uring_link - called before the io_uring request added into link_list of
* another request
*
- * @ctx: pointer to a ring context structure
* @req: pointer to a linked request
* @target_req: pointer to a previous request, that would contain @req
*
@@ -233,9 +222,9 @@ TRACE_EVENT(io_uring_defer,
*/
TRACE_EVENT(io_uring_link,
- TP_PROTO(void *ctx, void *req, void *target_req),
+ TP_PROTO(struct io_kiocb *req, struct io_kiocb *target_req),
- TP_ARGS(ctx, req, target_req),
+ TP_ARGS(req, target_req),
TP_STRUCT__entry (
__field( void *, ctx )
@@ -244,7 +233,7 @@ TRACE_EVENT(io_uring_link,
),
TP_fast_assign(
- __entry->ctx = ctx;
+ __entry->ctx = req->ctx;
__entry->req = req;
__entry->target_req = target_req;
),
@@ -285,10 +274,7 @@ TRACE_EVENT(io_uring_cqring_wait,
/**
* io_uring_fail_link - called before failing a linked request
*
- * @ctx: pointer to a ring context structure
* @req: request, which links were cancelled
- * @user_data: user data associated with the request
- * @opcode: opcode of request
* @link: cancelled link
*
* Allows to track linked requests cancellation, to see not only that some work
@@ -296,9 +282,9 @@ TRACE_EVENT(io_uring_cqring_wait,
*/
TRACE_EVENT(io_uring_fail_link,
- TP_PROTO(void *ctx, void *req, unsigned long long user_data, u8 opcode, void *link),
+ TP_PROTO(struct io_kiocb *req, struct io_kiocb *link),
- TP_ARGS(ctx, req, user_data, opcode, link),
+ TP_ARGS(req, link),
TP_STRUCT__entry (
__field( void *, ctx )
@@ -307,17 +293,17 @@ TRACE_EVENT(io_uring_fail_link,
__field( u8, opcode )
__field( void *, link )
- __string( op_str, io_uring_get_opcode(opcode) )
+ __string( op_str, io_uring_get_opcode(req->opcode) )
),
TP_fast_assign(
- __entry->ctx = ctx;
+ __entry->ctx = req->ctx;
__entry->req = req;
- __entry->user_data = user_data;
- __entry->opcode = opcode;
+ __entry->user_data = req->cqe.user_data;
+ __entry->opcode = req->opcode;
__entry->link = link;
- __assign_str(op_str, io_uring_get_opcode(opcode));
+ __assign_str(op_str, io_uring_get_opcode(req->opcode));
),
TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, link %p",
@@ -376,23 +362,17 @@ TRACE_EVENT(io_uring_complete,
/**
* io_uring_submit_sqe - called before submitting one SQE
*
- * @ctx: pointer to a ring context structure
* @req: pointer to a submitted request
- * @user_data: user data associated with the request
- * @opcode: opcode of request
- * @flags request flags
* @force_nonblock: whether a context blocking or not
- * @sq_thread: true if sq_thread has submitted this SQE
*
* Allows to track SQE submitting, to understand what was the source of it, SQ
* thread or io_uring_enter call.
*/
TRACE_EVENT(io_uring_submit_sqe,
- TP_PROTO(void *ctx, void *req, unsigned long long user_data, u8 opcode, u32 flags,
- bool force_nonblock, bool sq_thread),
+ TP_PROTO(struct io_kiocb *req, bool force_nonblock),
- TP_ARGS(ctx, req, user_data, opcode, flags, force_nonblock, sq_thread),
+ TP_ARGS(req, force_nonblock),
TP_STRUCT__entry (
__field( void *, ctx )
@@ -403,19 +383,19 @@ TRACE_EVENT(io_uring_submit_sqe,
__field( bool, force_nonblock )
__field( bool, sq_thread )
- __string( op_str, io_uring_get_opcode(opcode) )
+ __string( op_str, io_uring_get_opcode(req->opcode) )
),
TP_fast_assign(
- __entry->ctx = ctx;
+ __entry->ctx = req->ctx;
__entry->req = req;
- __entry->user_data = user_data;
- __entry->opcode = opcode;
- __entry->flags = flags;
+ __entry->user_data = req->cqe.user_data;
+ __entry->opcode = req->opcode;
+ __entry->flags = req->flags;
__entry->force_nonblock = force_nonblock;
- __entry->sq_thread = sq_thread;
+ __entry->sq_thread = req->ctx->flags & IORING_SETUP_SQPOLL;
- __assign_str(op_str, io_uring_get_opcode(opcode));
+ __assign_str(op_str, io_uring_get_opcode(req->opcode));
),
TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, flags 0x%x, "
@@ -427,10 +407,7 @@ TRACE_EVENT(io_uring_submit_sqe,
/*
* io_uring_poll_arm - called after arming a poll wait if successful
*
- * @ctx: pointer to a ring context structure
* @req: pointer to the armed request
- * @user_data: user data associated with the request
- * @opcode: opcode of request
* @mask: request poll events mask
* @events: registered events of interest
*
@@ -439,10 +416,9 @@ TRACE_EVENT(io_uring_submit_sqe,
*/
TRACE_EVENT(io_uring_poll_arm,
- TP_PROTO(void *ctx, void *req, u64 user_data, u8 opcode,
- int mask, int events),
+ TP_PROTO(struct io_kiocb *req, int mask, int events),
- TP_ARGS(ctx, req, user_data, opcode, mask, events),
+ TP_ARGS(req, mask, events),
TP_STRUCT__entry (
__field( void *, ctx )
@@ -452,18 +428,18 @@ TRACE_EVENT(io_uring_poll_arm,
__field( int, mask )
__field( int, events )
- __string( op_str, io_uring_get_opcode(opcode) )
+ __string( op_str, io_uring_get_opcode(req->opcode) )
),
TP_fast_assign(
- __entry->ctx = ctx;
+ __entry->ctx = req->ctx;
__entry->req = req;
- __entry->user_data = user_data;
- __entry->opcode = opcode;
+ __entry->user_data = req->cqe.user_data;
+ __entry->opcode = req->opcode;
__entry->mask = mask;
__entry->events = events;
- __assign_str(op_str, io_uring_get_opcode(opcode));
+ __assign_str(op_str, io_uring_get_opcode(req->opcode));
),
TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, mask 0x%x, events 0x%x",
@@ -475,18 +451,15 @@ TRACE_EVENT(io_uring_poll_arm,
/*
* io_uring_task_add - called after adding a task
*
- * @ctx: pointer to a ring context structure
* @req: pointer to request
- * @user_data: user data associated with the request
- * @opcode: opcode of request
* @mask: request poll events mask
*
*/
TRACE_EVENT(io_uring_task_add,
- TP_PROTO(void *ctx, void *req, unsigned long long user_data, u8 opcode, int mask),
+ TP_PROTO(struct io_kiocb *req, int mask),
- TP_ARGS(ctx, req, user_data, opcode, mask),
+ TP_ARGS(req, mask),
TP_STRUCT__entry (
__field( void *, ctx )
@@ -495,17 +468,17 @@ TRACE_EVENT(io_uring_task_add,
__field( u8, opcode )
__field( int, mask )
- __string( op_str, io_uring_get_opcode(opcode) )
+ __string( op_str, io_uring_get_opcode(req->opcode) )
),
TP_fast_assign(
- __entry->ctx = ctx;
+ __entry->ctx = req->ctx;
__entry->req = req;
- __entry->user_data = user_data;
- __entry->opcode = opcode;
+ __entry->user_data = req->cqe.user_data;
+ __entry->opcode = req->opcode;
__entry->mask = mask;
- __assign_str(op_str, io_uring_get_opcode(opcode));
+ __assign_str(op_str, io_uring_get_opcode(req->opcode));
),
TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, mask %x",
@@ -518,7 +491,6 @@ TRACE_EVENT(io_uring_task_add,
* io_uring_req_failed - called when an sqe is errored during submission
*
* @sqe: pointer to the io_uring_sqe that failed
- * @ctx: pointer to a ring context structure
* @req: pointer to request
* @error: error it failed with
*
@@ -526,9 +498,9 @@ TRACE_EVENT(io_uring_task_add,
*/
TRACE_EVENT(io_uring_req_failed,
- TP_PROTO(const struct io_uring_sqe *sqe, void *ctx, void *req, int error),
+ TP_PROTO(const struct io_uring_sqe *sqe, struct io_kiocb *req, int error),
- TP_ARGS(sqe, ctx, req, error),
+ TP_ARGS(sqe, req, error),
TP_STRUCT__entry (
__field( void *, ctx )
@@ -552,7 +524,7 @@ TRACE_EVENT(io_uring_req_failed,
),
TP_fast_assign(
- __entry->ctx = ctx;
+ __entry->ctx = req->ctx;
__entry->req = req;
__entry->user_data = sqe->user_data;
__entry->opcode = sqe->opcode;
@@ -622,12 +594,42 @@ TRACE_EVENT(io_uring_cqe_overflow,
__entry->ocqe = ocqe;
),
- TP_printk("ring %p, user_data 0x%llx, res %d, flags %x, "
+ TP_printk("ring %p, user_data 0x%llx, res %d, cflags 0x%x, "
"overflow_cqe %p",
__entry->ctx, __entry->user_data, __entry->res,
__entry->cflags, __entry->ocqe)
);
+/*
+ * io_uring_task_work_run - ran task work
+ *
+ * @tctx: pointer to a io_uring_task
+ * @count: how many functions it ran
+ * @loops: how many loops it ran
+ *
+ */
+TRACE_EVENT(io_uring_task_work_run,
+
+ TP_PROTO(void *tctx, unsigned int count, unsigned int loops),
+
+ TP_ARGS(tctx, count, loops),
+
+ TP_STRUCT__entry (
+ __field( void *, tctx )
+ __field( unsigned int, count )
+ __field( unsigned int, loops )
+ ),
+
+ TP_fast_assign(
+ __entry->tctx = tctx;
+ __entry->count = count;
+ __entry->loops = loops;
+ ),
+
+ TP_printk("tctx %p, count %u, loops %u",
+ __entry->tctx, __entry->count, __entry->loops)
+);
+
#endif /* _TRACE_IO_URING_H */
/* This part must be outside protection */
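The common thread in the tracepoint changes above is that each event now takes the io_kiocb itself and derives ring, user_data, opcode and flags from it in TP_fast_assign, rather than having every call site spell them out. Roughly, at a call site (a sketch, not the literal in-tree lines):

/* before: every field passed explicitly by the caller */
trace_io_uring_queue_async_work(ctx, req, req->cqe.user_data,
                                req->opcode, req->flags, &req->work, rw);

/* after: the event pulls those fields out of the request */
trace_io_uring_queue_async_work(req, rw);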
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 0ad3da28d2fc..4c9b11e2e991 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -10,6 +10,7 @@
#include <linux/fs.h>
#include <linux/types.h>
+#include <linux/time_types.h>
/*
* IO submission data structure (Submission Queue Entry)
@@ -50,6 +51,7 @@ struct io_uring_sqe {
__u32 unlink_flags;
__u32 hardlink_flags;
__u32 xattr_flags;
+ __u32 msg_ring_flags;
};
__u64 user_data; /* data to be passed back at completion time */
/* pack this to avoid bogus arm OABI complaints */
@@ -140,9 +142,12 @@ enum {
* IORING_SQ_TASKRUN in the sq ring flags. Not valid with COOP_TASKRUN.
*/
#define IORING_SETUP_TASKRUN_FLAG (1U << 9)
-
#define IORING_SETUP_SQE128 (1U << 10) /* SQEs are 128 byte */
#define IORING_SETUP_CQE32 (1U << 11) /* CQEs are 32 byte */
+/*
+ * Only one task is allowed to submit requests
+ */
+#define IORING_SETUP_SINGLE_ISSUER (1U << 12)
enum io_uring_op {
IORING_OP_NOP,
@@ -229,10 +234,13 @@ enum io_uring_op {
*
* IORING_POLL_UPDATE Update existing poll request, matching
* sqe->addr as the old user_data field.
+ *
+ * IORING_POLL_LEVEL Level triggered poll.
*/
#define IORING_POLL_ADD_MULTI (1U << 0)
#define IORING_POLL_UPDATE_EVENTS (1U << 1)
#define IORING_POLL_UPDATE_USER_DATA (1U << 2)
+#define IORING_POLL_ADD_LEVEL (1U << 3)
/*
* ASYNC_CANCEL flags.
@@ -241,10 +249,12 @@ enum io_uring_op {
* IORING_ASYNC_CANCEL_FD Key off 'fd' for cancelation rather than the
* request 'user_data'
* IORING_ASYNC_CANCEL_ANY Match any request
+ * IORING_ASYNC_CANCEL_FD_FIXED 'fd' passed in is a fixed descriptor
*/
#define IORING_ASYNC_CANCEL_ALL (1U << 0)
#define IORING_ASYNC_CANCEL_FD (1U << 1)
#define IORING_ASYNC_CANCEL_ANY (1U << 2)
+#define IORING_ASYNC_CANCEL_FD_FIXED (1U << 3)
/*
* send/sendmsg and recv/recvmsg flags (sqe->ioprio)
@@ -253,8 +263,13 @@ enum io_uring_op {
* or receive and arm poll if that yields an
* -EAGAIN result, arm poll upfront and skip
* the initial transfer attempt.
+ *
+ * IORING_RECV_MULTISHOT Multishot recv. Sets IORING_CQE_F_MORE if
+ * the handler will continue to report
+ * CQEs on behalf of the same SQE.
*/
#define IORING_RECVSEND_POLL_FIRST (1U << 0)
+#define IORING_RECV_MULTISHOT (1U << 1)
/*
* accept flags stored in sqe->ioprio
@@ -262,6 +277,22 @@ enum io_uring_op {
#define IORING_ACCEPT_MULTISHOT (1U << 0)
/*
+ * IORING_OP_MSG_RING command types, stored in sqe->addr
+ */
+enum {
+ IORING_MSG_DATA, /* pass sqe->len as 'res' and off as user_data */
+ IORING_MSG_SEND_FD, /* send a registered fd to another ring */
+};
+
+/*
+ * IORING_OP_MSG_RING flags (sqe->msg_ring_flags)
+ *
+ * IORING_MSG_RING_CQE_SKIP Don't post a CQE to the target ring. Not
+ * applicable for IORING_MSG_DATA, obviously.
+ */
+#define IORING_MSG_RING_CQE_SKIP (1U << 0)
+
+/*
* IO completion data structure (Completion Queue Entry)
*/
struct io_uring_cqe {
@@ -420,6 +451,12 @@ enum {
IORING_REGISTER_PBUF_RING = 22,
IORING_UNREGISTER_PBUF_RING = 23,
+ /* sync cancelation API */
+ IORING_REGISTER_SYNC_CANCEL = 24,
+
+ /* register a range of fixed file slots for automatic slot allocation */
+ IORING_REGISTER_FILE_ALLOC_RANGE = 25,
+
/* this goes last */
IORING_REGISTER_LAST
};
@@ -483,7 +520,7 @@ struct io_uring_probe {
__u8 ops_len; /* length of ops[] array below */
__u16 resv;
__u32 resv2[3];
- struct io_uring_probe_op ops[0];
+ struct io_uring_probe_op ops[];
};
struct io_uring_restriction {
@@ -555,4 +592,32 @@ struct io_uring_getevents_arg {
__u64 ts;
};
+/*
+ * Argument for IORING_REGISTER_SYNC_CANCEL
+ */
+struct io_uring_sync_cancel_reg {
+ __u64 addr;
+ __s32 fd;
+ __u32 flags;
+ struct __kernel_timespec timeout;
+ __u64 pad[4];
+};
+
+/*
+ * Argument for IORING_REGISTER_FILE_ALLOC_RANGE
+ * The range is specified as [off, off + len)
+ */
+struct io_uring_file_index_range {
+ __u32 off;
+ __u32 len;
+ __u64 resv;
+};
+
+struct io_uring_recvmsg_out {
+ __u32 namelen;
+ __u32 controllen;
+ __u32 payloadlen;
+ __u32 flags;
+};
+
#endif
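For the two register opcodes introduced above, here is a minimal user-space sketch driving them through the raw io_uring_register(2) syscall; the opcodes and structs come from this header, while the helper names, values and (omitted) error handling are illustrative:

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

/* reserve fixed-file slots [off, off + len) for automatic allocation */
static int register_file_alloc_range(int ring_fd, __u32 off, __u32 len)
{
        struct io_uring_file_index_range range = {
                .off = off,
                .len = len,
        };

        return syscall(__NR_io_uring_register, ring_fd,
                       IORING_REGISTER_FILE_ALLOC_RANGE, &range, 0);
}

/* synchronously cancel the request matching 'user_data', waiting up to 1ms */
static int sync_cancel_by_user_data(int ring_fd, __u64 user_data)
{
        struct io_uring_sync_cancel_reg reg;

        memset(&reg, 0, sizeof(reg));
        reg.addr = user_data;
        reg.fd = -1;                    /* unused without IORING_ASYNC_CANCEL_FD */
        reg.timeout.tv_sec = 0;
        reg.timeout.tv_nsec = 1000000;  /* 1ms */

        return syscall(__NR_io_uring_register, ring_fd,
                       IORING_REGISTER_SYNC_CANCEL, &reg, 1);
}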