Diffstat (limited to 'fs/aio.c')
-rw-r--r--	fs/aio.c	311
1 file changed, 226 insertions(+), 85 deletions(-)
diff --git a/fs/aio.c b/fs/aio.c
index 5f3d3d814928..5b2ff20ad322 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -27,7 +27,6 @@
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/mman.h>
-#include <linux/mmu_context.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/timer.h>
@@ -44,7 +43,6 @@
#include <linux/mount.h>
#include <linux/pseudo_fs.h>
-#include <asm/kmap_types.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
@@ -68,7 +66,7 @@ struct aio_ring {
unsigned header_length; /* size of aio_ring */
- struct io_event io_events[0];
+ struct io_event io_events[];
}; /* 128 bytes + ring size */
/*
@@ -176,14 +174,16 @@ struct fsync_iocb {
struct file *file;
struct work_struct work;
bool datasync;
+ struct cred *creds;
};
struct poll_iocb {
struct file *file;
struct wait_queue_head *head;
__poll_t events;
- bool done;
bool cancelled;
+ bool work_scheduled;
+ bool work_need_resched;
struct wait_queue_entry wait;
struct work_struct work;
};
@@ -220,9 +220,35 @@ struct aio_kiocb {
/*------ sysctl variables----*/
static DEFINE_SPINLOCK(aio_nr_lock);
-unsigned long aio_nr; /* current system wide number of aio requests */
-unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */
+static unsigned long aio_nr; /* current system wide number of aio requests */
+static unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */
/*----end sysctl variables---*/
+#ifdef CONFIG_SYSCTL
+static struct ctl_table aio_sysctls[] = {
+ {
+ .procname = "aio-nr",
+ .data = &aio_nr,
+ .maxlen = sizeof(aio_nr),
+ .mode = 0444,
+ .proc_handler = proc_doulongvec_minmax,
+ },
+ {
+ .procname = "aio-max-nr",
+ .data = &aio_max_nr,
+ .maxlen = sizeof(aio_max_nr),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
+ {}
+};
+
+static void __init aio_sysctl_init(void)
+{
+ register_sysctl_init("fs", aio_sysctls);
+}
+#else
+#define aio_sysctl_init() do { } while (0)
+#endif
static struct kmem_cache *kiocb_cachep;
static struct kmem_cache *kioctx_cachep;
@@ -275,6 +301,7 @@ static int __init aio_setup(void)
kiocb_cachep = KMEM_CACHE(aio_kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
+ aio_sysctl_init();
return 0;
}
__initcall(aio_setup);
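
Note: the aio-nr and aio-max-nr knobs registered above keep their existing /proc/sys/fs paths; this change only moves the registration into aio.c and makes the variables static. A minimal userspace sketch for inspecting them (only the long-standing procfs paths are assumed, nothing introduced by this patch):

#include <stdio.h>

/* Read one unsigned long from a procfs sysctl file. */
static int read_sysctl_ulong(const char *path, unsigned long *val)
{
	FILE *f = fopen(path, "r");

	if (!f)
		return -1;
	if (fscanf(f, "%lu", val) != 1) {
		fclose(f);
		return -1;
	}
	fclose(f);
	return 0;
}

int main(void)
{
	unsigned long nr, max_nr;

	if (read_sysctl_ulong("/proc/sys/fs/aio-nr", &nr) ||
	    read_sysctl_ulong("/proc/sys/fs/aio-max-nr", &max_nr))
		return 1;
	printf("aio-nr: %lu of aio-max-nr: %lu\n", nr, max_nr);
	return 0;
}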
@@ -373,8 +400,8 @@ static const struct file_operations aio_ring_fops = {
};
#if IS_ENABLED(CONFIG_MIGRATION)
-static int aio_migratepage(struct address_space *mapping, struct page *new,
- struct page *old, enum migrate_mode mode)
+static int aio_migrate_folio(struct address_space *mapping, struct folio *dst,
+ struct folio *src, enum migrate_mode mode)
{
struct kioctx *ctx;
unsigned long flags;
@@ -408,10 +435,10 @@ static int aio_migratepage(struct address_space *mapping, struct page *new,
goto out;
}
- idx = old->index;
+ idx = src->index;
if (idx < (pgoff_t)ctx->nr_pages) {
- /* Make sure the old page hasn't already been changed */
- if (ctx->ring_pages[idx] != old)
+ /* Make sure the old folio hasn't already been changed */
+ if (ctx->ring_pages[idx] != &src->page)
rc = -EAGAIN;
} else
rc = -EINVAL;
@@ -420,27 +447,27 @@ static int aio_migratepage(struct address_space *mapping, struct page *new,
goto out_unlock;
/* Writeback must be complete */
- BUG_ON(PageWriteback(old));
- get_page(new);
+ BUG_ON(folio_test_writeback(src));
+ folio_get(dst);
- rc = migrate_page_move_mapping(mapping, new, old, 1);
+ rc = folio_migrate_mapping(mapping, dst, src, 1);
if (rc != MIGRATEPAGE_SUCCESS) {
- put_page(new);
+ folio_put(dst);
goto out_unlock;
}
/* Take completion_lock to prevent other writes to the ring buffer
- * while the old page is copied to the new. This prevents new
+ * while the old folio is copied to the new. This prevents new
* events from being lost.
*/
spin_lock_irqsave(&ctx->completion_lock, flags);
- migrate_page_copy(new, old);
- BUG_ON(ctx->ring_pages[idx] != old);
- ctx->ring_pages[idx] = new;
+ folio_migrate_copy(dst, src);
+ BUG_ON(ctx->ring_pages[idx] != &src->page);
+ ctx->ring_pages[idx] = &dst->page;
spin_unlock_irqrestore(&ctx->completion_lock, flags);
- /* The old page is no longer accessible. */
- put_page(old);
+ /* The old folio is no longer accessible. */
+ folio_put(src);
out_unlock:
mutex_unlock(&ctx->ring_lock);
@@ -448,13 +475,13 @@ out:
spin_unlock(&mapping->private_lock);
return rc;
}
+#else
+#define aio_migrate_folio NULL
#endif
static const struct address_space_operations aio_ctx_aops = {
- .set_page_dirty = __set_page_dirty_no_writeback,
-#if IS_ENABLED(CONFIG_MIGRATION)
- .migratepage = aio_migratepage,
-#endif
+ .dirty_folio = noop_dirty_folio,
+ .migrate_folio = aio_migrate_folio,
};
static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
@@ -519,16 +546,16 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
ctx->mmap_size = nr_pages * PAGE_SIZE;
pr_debug("attempting mmap of %lu bytes\n", ctx->mmap_size);
- if (down_write_killable(&mm->mmap_sem)) {
+ if (mmap_write_lock_killable(mm)) {
ctx->mmap_size = 0;
aio_free_ring(ctx);
return -EINTR;
}
- ctx->mmap_base = do_mmap_pgoff(ctx->aio_ring_file, 0, ctx->mmap_size,
- PROT_READ | PROT_WRITE,
- MAP_SHARED, 0, &unused, NULL);
- up_write(&mm->mmap_sem);
+ ctx->mmap_base = do_mmap(ctx->aio_ring_file, 0, ctx->mmap_size,
+ PROT_READ | PROT_WRITE,
+ MAP_SHARED, 0, &unused, NULL);
+ mmap_write_unlock(mm);
if (IS_ERR((void *)ctx->mmap_base)) {
ctx->mmap_size = 0;
aio_free_ring(ctx);
@@ -660,8 +687,7 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
new_nr = (table ? table->nr : 1) * 4;
spin_unlock(&mm->ioctx_lock);
- table = kzalloc(sizeof(*table) + sizeof(struct kioctx *) *
- new_nr, GFP_KERNEL);
+ table = kzalloc(struct_size(table, table, new_nr), GFP_KERNEL);
if (!table)
return -ENOMEM;
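
Note: struct_size(table, table, new_nr) replaces the open-coded arithmetic above; it computes sizeof(*table) plus new_nr elements of the flexible array member and saturates instead of wrapping on overflow, so an overflowed size makes the allocation fail. A rough userspace analogue of the same flexible-array allocation pattern (struct and helper names here are illustrative, not from the kernel):

#include <stdlib.h>

/* Illustrative analogue of struct kioctx_table: a header plus a flexible array. */
struct ptr_table {
	unsigned int nr;
	void *slot[];		/* flexible array member, formerly written "slot[0]" */
};

/* Allocate a table with nr slots. Refuse on overflow instead of
 * under-allocating, analogous to struct_size() saturating to SIZE_MAX
 * so that the allocation itself fails. */
static struct ptr_table *ptr_table_alloc(unsigned int nr)
{
	size_t bytes;
	struct ptr_table *t;

	if (__builtin_mul_overflow((size_t)nr, sizeof(void *), &bytes) ||
	    __builtin_add_overflow(bytes, sizeof(*t), &bytes))
		return NULL;

	t = calloc(1, bytes);
	if (t)
		t->nr = nr;
	return t;
}

int main(void)
{
	struct ptr_table *t = ptr_table_alloc(16);

	if (!t)
		return 1;
	free(t);
	return 0;
}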
@@ -925,16 +951,13 @@ static bool __get_reqs_available(struct kioctx *ctx)
local_irq_save(flags);
kcpu = this_cpu_ptr(ctx->cpu);
if (!kcpu->reqs_available) {
- int old, avail = atomic_read(&ctx->reqs_available);
+ int avail = atomic_read(&ctx->reqs_available);
do {
if (avail < ctx->req_batch)
goto out;
-
- old = avail;
- avail = atomic_cmpxchg(&ctx->reqs_available,
- avail, avail - ctx->req_batch);
- } while (avail != old);
+ } while (!atomic_try_cmpxchg(&ctx->reqs_available,
+ &avail, avail - ctx->req_batch));
kcpu->reqs_available += ctx->req_batch;
}
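
Note: atomic_try_cmpxchg() returns a boolean and, on failure, writes the value it actually observed back through the first argument, so the caller no longer needs a separate "old" variable to detect a lost race. The same shape in portable C11 atomics, as a standalone sketch with illustrative names:

#include <stdatomic.h>
#include <stdbool.h>

/* Take 'batch' units from a shared counter, or fail if fewer remain.
 * atomic_compare_exchange_weak() updates 'avail' with the current value
 * on failure, mirroring atomic_try_cmpxchg() in the hunk above. */
static bool take_batch(atomic_int *reqs_available, int batch)
{
	int avail = atomic_load(reqs_available);

	do {
		if (avail < batch)
			return false;
	} while (!atomic_compare_exchange_weak(reqs_available, &avail,
					       avail - batch));
	return true;
}

int main(void)
{
	atomic_int avail = 100;

	return take_batch(&avail, 32) ? 0 : 1;
}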
@@ -1418,7 +1441,7 @@ static void aio_remove_iocb(struct aio_kiocb *iocb)
spin_unlock_irqrestore(&ctx->ctx_lock, flags);
}
-static void aio_complete_rw(struct kiocb *kiocb, long res, long res2)
+static void aio_complete_rw(struct kiocb *kiocb, long res)
{
struct aio_kiocb *iocb = container_of(kiocb, struct aio_kiocb, rw);
@@ -1438,7 +1461,7 @@ static void aio_complete_rw(struct kiocb *kiocb, long res, long res2)
}
iocb->ki_res.res = res;
- iocb->ki_res.res2 = res2;
+ iocb->ki_res.res2 = 0;
iocb_put(iocb);
}
@@ -1449,10 +1472,9 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
req->ki_complete = aio_complete_rw;
req->private = NULL;
req->ki_pos = iocb->aio_offset;
- req->ki_flags = iocb_flags(req->ki_filp);
+ req->ki_flags = req->ki_filp->f_iocb_flags;
if (iocb->aio_flags & IOCB_FLAG_RESFD)
req->ki_flags |= IOCB_EVENTFD;
- req->ki_hint = ki_hint_validate(file_write_hint(req->ki_filp));
if (iocb->aio_flags & IOCB_FLAG_IOPRIO) {
/*
* If the IOCB_FLAG_IOPRIO flag of aio_flags is set, then
@@ -1489,12 +1511,8 @@ static ssize_t aio_setup_rw(int rw, const struct iocb *iocb,
*iovec = NULL;
return ret;
}
-#ifdef CONFIG_COMPAT
- if (compat)
- return compat_import_iovec(rw, buf, len, UIO_FASTIOV, iovec,
- iter);
-#endif
- return import_iovec(rw, buf, len, UIO_FASTIOV, iovec, iter);
+
+ return __import_iovec(rw, buf, len, UIO_FASTIOV, iovec, iter, compat);
}
static inline void aio_rw_done(struct kiocb *req, ssize_t ret)
@@ -1511,9 +1529,9 @@ static inline void aio_rw_done(struct kiocb *req, ssize_t ret)
* may be already running. Just fail this IO with EINTR.
*/
ret = -EINTR;
- /*FALLTHRU*/
+ fallthrough;
default:
- req->ki_complete(req, ret, 0);
+ req->ki_complete(req, ret);
}
}
@@ -1531,7 +1549,6 @@ static int aio_read(struct kiocb *req, const struct iocb *iocb,
file = req->ki_filp;
if (unlikely(!(file->f_mode & FMODE_READ)))
return -EBADF;
- ret = -EINVAL;
if (unlikely(!file->f_op->read_iter))
return -EINVAL;
@@ -1576,7 +1593,7 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb,
* we return to userspace.
*/
if (S_ISREG(file_inode(file)->i_mode)) {
- __sb_start_write(file_inode(file)->i_sb, SB_FREEZE_WRITE, true);
+ sb_start_write(file_inode(file)->i_sb);
__sb_writers_release(file_inode(file)->i_sb, SB_FREEZE_WRITE);
}
req->ki_flags |= IOCB_WRITE;
@@ -1589,8 +1606,11 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb,
static void aio_fsync_work(struct work_struct *work)
{
struct aio_kiocb *iocb = container_of(work, struct aio_kiocb, fsync.work);
+ const struct cred *old_cred = override_creds(iocb->fsync.creds);
iocb->ki_res.res = vfs_fsync(iocb->fsync.file, iocb->fsync.datasync);
+ revert_creds(old_cred);
+ put_cred(iocb->fsync.creds);
iocb_put(iocb);
}
@@ -1604,6 +1624,10 @@ static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb,
if (unlikely(!req->file->f_op->fsync))
return -EINVAL;
+ req->creds = prepare_creds();
+ if (!req->creds)
+ return -ENOMEM;
+
req->datasync = datasync;
INIT_WORK(&req->work, aio_fsync_work);
schedule_work(&req->work);
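
Note: aio_fsync() runs vfs_fsync() from a workqueue, so the submitter's credentials are now captured at submission time and re-applied around the actual fsync. The userspace contract is unchanged; a minimal raw-syscall sketch of submitting IOCB_CMD_FSYNC (the file name and error handling are local to the example):

#include <linux/aio_abi.h>	/* aio_context_t, struct iocb, IOCB_CMD_FSYNC */
#include <sys/syscall.h>
#include <unistd.h>
#include <fcntl.h>
#include <string.h>
#include <stdio.h>

int main(void)
{
	aio_context_t ctx = 0;
	struct iocb cb;
	struct iocb *cbs[1] = { &cb };
	struct io_event ev;
	int fd = open("testfile", O_WRONLY | O_CREAT, 0600);

	if (fd < 0 || syscall(SYS_io_setup, 1, &ctx) < 0)
		return 1;

	memset(&cb, 0, sizeof(cb));
	cb.aio_fildes = fd;
	cb.aio_lio_opcode = IOCB_CMD_FSYNC;	/* handled by aio_fsync() above */

	if (syscall(SYS_io_submit, ctx, 1, cbs) != 1)
		return 1;
	/* The completion event arrives once aio_fsync_work() has run vfs_fsync(). */
	if (syscall(SYS_io_getevents, ctx, 1, 1, &ev, NULL) != 1)
		return 1;
	printf("fsync result: %lld\n", (long long)ev.res);

	syscall(SYS_io_destroy, ctx);
	close(fd);
	return 0;
}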
@@ -1618,6 +1642,51 @@ static void aio_poll_put_work(struct work_struct *work)
iocb_put(iocb);
}
+/*
+ * Safely lock the waitqueue which the request is on, synchronizing with the
+ * case where the ->poll() provider decides to free its waitqueue early.
+ *
+ * Returns true on success, meaning that req->head->lock was locked, req->wait
+ * is on req->head, and an RCU read lock was taken. Returns false if the
+ * request was already removed from its waitqueue (which might no longer exist).
+ */
+static bool poll_iocb_lock_wq(struct poll_iocb *req)
+{
+ wait_queue_head_t *head;
+
+ /*
+ * While we hold the waitqueue lock and the waitqueue is nonempty,
+ * wake_up_pollfree() will wait for us. However, taking the waitqueue
+ * lock in the first place can race with the waitqueue being freed.
+ *
+ * We solve this as eventpoll does: by taking advantage of the fact that
+ * all users of wake_up_pollfree() will RCU-delay the actual free. If
+ * we enter rcu_read_lock() and see that the pointer to the queue is
+ * non-NULL, we can then lock it without the memory being freed out from
+ * under us, then check whether the request is still on the queue.
+ *
+ * Keep holding rcu_read_lock() as long as we hold the queue lock, in
+ * case the caller deletes the entry from the queue, leaving it empty.
+ * In that case, only RCU prevents the queue memory from being freed.
+ */
+ rcu_read_lock();
+ head = smp_load_acquire(&req->head);
+ if (head) {
+ spin_lock(&head->lock);
+ if (!list_empty(&req->wait.entry))
+ return true;
+ spin_unlock(&head->lock);
+ }
+ rcu_read_unlock();
+ return false;
+}
+
+static void poll_iocb_unlock_wq(struct poll_iocb *req)
+{
+ spin_unlock(&req->head->lock);
+ rcu_read_unlock();
+}
+
static void aio_poll_complete_work(struct work_struct *work)
{
struct poll_iocb *req = container_of(work, struct poll_iocb, work);
@@ -1637,14 +1706,27 @@ static void aio_poll_complete_work(struct work_struct *work)
* avoid further branches in the fast path.
*/
spin_lock_irq(&ctx->ctx_lock);
- if (!mask && !READ_ONCE(req->cancelled)) {
- add_wait_queue(req->head, &req->wait);
- spin_unlock_irq(&ctx->ctx_lock);
- return;
- }
+ if (poll_iocb_lock_wq(req)) {
+ if (!mask && !READ_ONCE(req->cancelled)) {
+ /*
+ * The request isn't actually ready to be completed yet.
+ * Reschedule completion if another wakeup came in.
+ */
+ if (req->work_need_resched) {
+ schedule_work(&req->work);
+ req->work_need_resched = false;
+ } else {
+ req->work_scheduled = false;
+ }
+ poll_iocb_unlock_wq(req);
+ spin_unlock_irq(&ctx->ctx_lock);
+ return;
+ }
+ list_del_init(&req->wait.entry);
+ poll_iocb_unlock_wq(req);
+ } /* else, POLLFREE has freed the waitqueue, so we must complete */
list_del_init(&iocb->ki_list);
iocb->ki_res.res = mangle_poll(mask);
- req->done = true;
spin_unlock_irq(&ctx->ctx_lock);
iocb_put(iocb);
@@ -1656,13 +1738,14 @@ static int aio_poll_cancel(struct kiocb *iocb)
struct aio_kiocb *aiocb = container_of(iocb, struct aio_kiocb, rw);
struct poll_iocb *req = &aiocb->poll;
- spin_lock(&req->head->lock);
- WRITE_ONCE(req->cancelled, true);
- if (!list_empty(&req->wait.entry)) {
- list_del_init(&req->wait.entry);
- schedule_work(&aiocb->poll.work);
- }
- spin_unlock(&req->head->lock);
+ if (poll_iocb_lock_wq(req)) {
+ WRITE_ONCE(req->cancelled, true);
+ if (!req->work_scheduled) {
+ schedule_work(&aiocb->poll.work);
+ req->work_scheduled = true;
+ }
+ poll_iocb_unlock_wq(req);
+ } /* else, the request was force-cancelled by POLLFREE already */
return 0;
}
@@ -1679,21 +1762,27 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
if (mask && !(mask & req->events))
return 0;
- list_del_init(&req->wait.entry);
-
- if (mask && spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) {
+ /*
+ * Complete the request inline if possible. This requires that three
+ * conditions be met:
+ * 1. An event mask must have been passed. If a plain wakeup was done
+ * instead, then mask == 0 and we have to call vfs_poll() to get
+ * the events, so inline completion isn't possible.
+ * 2. The completion work must not have already been scheduled.
+ * 3. ctx_lock must not be busy. We have to use trylock because we
+ * already hold the waitqueue lock, so this inverts the normal
+ * locking order. Use irqsave/irqrestore because not all
+ * filesystems (e.g. fuse) call this function with IRQs disabled,
+ * yet IRQs have to be disabled before ctx_lock is obtained.
+ */
+ if (mask && !req->work_scheduled &&
+ spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) {
struct kioctx *ctx = iocb->ki_ctx;
- /*
- * Try to complete the iocb inline if we can. Use
- * irqsave/irqrestore because not all filesystems (e.g. fuse)
- * call this function with IRQs disabled and because IRQs
- * have to be disabled before ctx_lock is obtained.
- */
+ list_del_init(&req->wait.entry);
list_del(&iocb->ki_list);
iocb->ki_res.res = mangle_poll(mask);
- req->done = true;
- if (iocb->ki_eventfd && eventfd_signal_count()) {
+ if (iocb->ki_eventfd && !eventfd_signal_allowed()) {
iocb = NULL;
INIT_WORK(&req->work, aio_poll_put_work);
schedule_work(&req->work);
@@ -1702,7 +1791,43 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
if (iocb)
iocb_put(iocb);
} else {
- schedule_work(&req->work);
+ /*
+ * Schedule the completion work if needed. If it was already
+ * scheduled, record that another wakeup came in.
+ *
+ * Don't remove the request from the waitqueue here, as it might
+ * not actually be complete yet (we won't know until vfs_poll()
+ * is called), and we must not miss any wakeups. POLLFREE is an
+ * exception to this; see below.
+ */
+ if (req->work_scheduled) {
+ req->work_need_resched = true;
+ } else {
+ schedule_work(&req->work);
+ req->work_scheduled = true;
+ }
+
+ /*
+ * If the waitqueue is being freed early but we can't complete
+ * the request inline, we have to tear down the request as best
+ * we can. That means immediately removing the request from its
+ * waitqueue and preventing all further accesses to the
+ * waitqueue via the request. We also need to schedule the
+ * completion work (done above). Also mark the request as
+ * cancelled, to potentially skip an unneeded call to ->poll().
+ */
+ if (mask & POLLFREE) {
+ WRITE_ONCE(req->cancelled, true);
+ list_del_init(&req->wait.entry);
+
+ /*
+ * Careful: this *must* be the last step, since as soon
+ * as req->head is NULL'ed out, the request can be
+ * completed and freed, since aio_poll_complete_work()
+ * will no longer need to take the waitqueue lock.
+ */
+ smp_store_release(&req->head, NULL);
+ }
}
return 1;
}
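
Note: the reworked wakeup path only changes how the kernel orders the completion work against cancellation and POLLFREE; the userspace contract for IOCB_CMD_POLL stays the same: the requested mask goes in aio_buf and the triggered mask comes back in the event's res field. A raw-syscall sketch, with io_cancel() shown as the optional path that reaches aio_poll_cancel() above (the pipe setup is local to the example):

#include <linux/aio_abi.h>	/* struct iocb, IOCB_CMD_POLL */
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>
#include <poll.h>		/* POLLIN */
#include <stdio.h>

int main(void)
{
	aio_context_t ctx = 0;
	struct iocb cb;
	struct iocb *cbs[1] = { &cb };
	struct io_event ev;
	int pfd[2];

	if (pipe(pfd) || syscall(SYS_io_setup, 1, &ctx) < 0)
		return 1;

	memset(&cb, 0, sizeof(cb));
	cb.aio_fildes = pfd[0];
	cb.aio_lio_opcode = IOCB_CMD_POLL;
	cb.aio_buf = POLLIN;		/* requested events, see demangle_poll() below */

	if (syscall(SYS_io_submit, ctx, 1, cbs) != 1)
		return 1;

	/* Make the pipe readable so the poll request completes ... */
	if (write(pfd[1], "x", 1) != 1)
		return 1;
	/* ... or, instead, tear the request down early; this is what reaches
	 * aio_poll_cancel():
	 *	syscall(SYS_io_cancel, ctx, &cb, &ev);
	 */

	if (syscall(SYS_io_getevents, ctx, 1, 1, &ev, NULL) != 1)
		return 1;
	printf("poll result mask: 0x%llx\n", (unsigned long long)ev.res);

	syscall(SYS_io_destroy, ctx);
	return 0;
}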
@@ -1710,6 +1835,7 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
struct aio_poll_table {
struct poll_table_struct pt;
struct aio_kiocb *iocb;
+ bool queued;
int error;
};
@@ -1720,11 +1846,12 @@ aio_poll_queue_proc(struct file *file, struct wait_queue_head *head,
struct aio_poll_table *pt = container_of(p, struct aio_poll_table, pt);
/* multiple wait queues per file are not supported */
- if (unlikely(pt->iocb->poll.head)) {
+ if (unlikely(pt->queued)) {
pt->error = -EINVAL;
return;
}
+ pt->queued = true;
pt->error = 0;
pt->iocb->poll.head = head;
add_wait_queue(head, &pt->iocb->poll.wait);
@@ -1749,12 +1876,14 @@ static int aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP;
req->head = NULL;
- req->done = false;
req->cancelled = false;
+ req->work_scheduled = false;
+ req->work_need_resched = false;
apt.pt._qproc = aio_poll_queue_proc;
apt.pt._key = req->events;
apt.iocb = aiocb;
+ apt.queued = false;
apt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */
/* initialized the list so that we can do list_empty checks */
@@ -1763,23 +1892,35 @@ static int aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
mask = vfs_poll(req->file, &apt.pt) & req->events;
spin_lock_irq(&ctx->ctx_lock);
- if (likely(req->head)) {
- spin_lock(&req->head->lock);
- if (unlikely(list_empty(&req->wait.entry))) {
- if (apt.error)
+ if (likely(apt.queued)) {
+ bool on_queue = poll_iocb_lock_wq(req);
+
+ if (!on_queue || req->work_scheduled) {
+ /*
+ * aio_poll_wake() already either scheduled the async
+ * completion work, or completed the request inline.
+ */
+ if (apt.error) /* unsupported case: multiple queues */
cancel = true;
apt.error = 0;
mask = 0;
}
if (mask || apt.error) {
+ /* Steal to complete synchronously. */
list_del_init(&req->wait.entry);
} else if (cancel) {
+ /* Cancel if possible (may be too late though). */
WRITE_ONCE(req->cancelled, true);
- } else if (!req->done) { /* actually waiting for an event */
+ } else if (on_queue) {
+ /*
+ * Actually waiting for an event, so add the request to
+ * active_reqs so that it can be cancelled if needed.
+ */
list_add_tail(&aiocb->ki_list, &ctx->active_reqs);
aiocb->ki_cancel = aio_poll_cancel;
}
- spin_unlock(&req->head->lock);
+ if (on_queue)
+ poll_iocb_unlock_wq(req);
}
if (mask) { /* no async, we'd stolen it */
aiocb->ki_res.res = mangle_poll(mask);