aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2021-09-03 10:08:28 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2021-09-03 10:08:28 -0700
commit14726903c835101cd8d0a703b609305094350d61 (patch)
tree5cdcf5d2f06ca14be76efd33a4de0e3b28a70de0 /fs
parentMerge tag 'scsi-misc' of git://git.kernel.org/pub/scm/linux/kernel/git/jejb/scsi (diff)
parentmm/madvise: add MADV_WILLNEED to process_madvise() (diff)
downloadlinux-dev-14726903c835101cd8d0a703b609305094350d61.tar.xz
linux-dev-14726903c835101cd8d0a703b609305094350d61.zip
Merge branch 'akpm' (patches from Andrew)
Merge misc updates from Andrew Morton: "173 patches. Subsystems affected by this series: ia64, ocfs2, block, and mm (debug, pagecache, gup, swap, shmem, memcg, selftests, pagemap, mremap, bootmem, sparsemem, vmalloc, kasan, pagealloc, memory-failure, hugetlb, userfaultfd, vmscan, compaction, mempolicy, memblock, oom-kill, migration, ksm, percpu, vmstat, and madvise)" * emailed patches from Andrew Morton <akpm@linux-foundation.org>: (173 commits) mm/madvise: add MADV_WILLNEED to process_madvise() mm/vmstat: remove unneeded return value mm/vmstat: simplify the array size calculation mm/vmstat: correct some wrong comments mm/percpu,c: remove obsolete comments of pcpu_chunk_populated() selftests: vm: add COW time test for KSM pages selftests: vm: add KSM merging time test mm: KSM: fix data type selftests: vm: add KSM merging across nodes test selftests: vm: add KSM zero page merging test selftests: vm: add KSM unmerge test selftests: vm: add KSM merge test mm/migrate: correct kernel-doc notation mm: wire up syscall process_mrelease mm: introduce process_mrelease system call memblock: make memblock_find_in_range method private mm/mempolicy.c: use in_task() in mempolicy_slab_node() mm/mempolicy: unify the create() func for bind/interleave/prefer-many policies mm/mempolicy: advertise new MPOL_PREFERRED_MANY mm/hugetlb: add support for mempolicy MPOL_PREFERRED_MANY ...
Diffstat (limited to 'fs')
-rw-r--r--fs/drop_caches.c3
-rw-r--r--fs/exec.c8
-rw-r--r--fs/fcntl.c3
-rw-r--r--fs/fs-writeback.c28
-rw-r--r--fs/fs_context.c4
-rw-r--r--fs/inode.c2
-rw-r--r--fs/locks.c6
-rw-r--r--fs/namei.c8
-rw-r--r--fs/namespace.c7
-rw-r--r--fs/ocfs2/dlmglue.c14
-rw-r--r--fs/ocfs2/quota_global.c1
-rw-r--r--fs/ocfs2/quota_local.c2
-rw-r--r--fs/pipe.c2
-rw-r--r--fs/select.c4
-rw-r--r--fs/userfaultfd.c116
15 files changed, 116 insertions, 92 deletions
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index f00fcc4a4f72..e619c31b6bd9 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -3,6 +3,7 @@
* Implement the manual drop-all-pagecache function
*/
+#include <linux/pagemap.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/fs.h>
@@ -27,7 +28,7 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
* we need to reschedule to avoid softlockups.
*/
if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
- (inode->i_mapping->nrpages == 0 && !need_resched())) {
+ (mapping_empty(inode->i_mapping) && !need_resched())) {
spin_unlock(&inode->i_lock);
continue;
}
diff --git a/fs/exec.c b/fs/exec.c
index 3b78b22addfb..2dc489c164fe 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -217,8 +217,10 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
* We are doing an exec(). 'current' is the process
* doing the exec and bprm->mm is the new process's mm.
*/
+ mmap_read_lock(bprm->mm);
ret = get_user_pages_remote(bprm->mm, pos, 1, gup_flags,
&page, NULL, NULL);
+ mmap_read_unlock(bprm->mm);
if (ret <= 0)
return NULL;
@@ -574,7 +576,7 @@ static int copy_strings(int argc, struct user_arg_ptr argv,
}
if (kmapped_page) {
- flush_kernel_dcache_page(kmapped_page);
+ flush_dcache_page(kmapped_page);
kunmap(kmapped_page);
put_arg_page(kmapped_page);
}
@@ -592,7 +594,7 @@ static int copy_strings(int argc, struct user_arg_ptr argv,
ret = 0;
out:
if (kmapped_page) {
- flush_kernel_dcache_page(kmapped_page);
+ flush_dcache_page(kmapped_page);
kunmap(kmapped_page);
put_arg_page(kmapped_page);
}
@@ -634,7 +636,7 @@ int copy_string_kernel(const char *arg, struct linux_binprm *bprm)
kaddr = kmap_atomic(page);
flush_arg_page(bprm, pos & PAGE_MASK, page);
memcpy(kaddr + offset_in_page(pos), arg, bytes_to_copy);
- flush_kernel_dcache_page(page);
+ flush_dcache_page(page);
kunmap_atomic(kaddr);
put_arg_page(page);
}
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 68added37c15..9c6c6a3e2de5 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -1051,7 +1051,8 @@ static int __init fcntl_init(void)
__FMODE_EXEC | __FMODE_NONOTIFY));
fasync_cache = kmem_cache_create("fasync_cache",
- sizeof(struct fasync_struct), 0, SLAB_PANIC, NULL);
+ sizeof(struct fasync_struct), 0,
+ SLAB_PANIC | SLAB_ACCOUNT, NULL);
return 0;
}
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index eb57dade6076..81ec192ce067 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -406,6 +406,11 @@ static bool inode_do_switch_wbs(struct inode *inode,
inc_wb_stat(new_wb, WB_WRITEBACK);
}
+ if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) {
+ atomic_dec(&old_wb->writeback_inodes);
+ atomic_inc(&new_wb->writeback_inodes);
+ }
+
wb_get(new_wb);
/*
@@ -1034,20 +1039,20 @@ restart:
* cgroup_writeback_by_id - initiate cgroup writeback from bdi and memcg IDs
* @bdi_id: target bdi id
* @memcg_id: target memcg css id
- * @nr: number of pages to write, 0 for best-effort dirty flushing
* @reason: reason why some writeback work initiated
* @done: target wb_completion
*
* Initiate flush of the bdi_writeback identified by @bdi_id and @memcg_id
* with the specified parameters.
*/
-int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, unsigned long nr,
+int cgroup_writeback_by_id(u64 bdi_id, int memcg_id,
enum wb_reason reason, struct wb_completion *done)
{
struct backing_dev_info *bdi;
struct cgroup_subsys_state *memcg_css;
struct bdi_writeback *wb;
struct wb_writeback_work *work;
+ unsigned long dirty;
int ret;
/* lookup bdi and memcg */
@@ -1076,24 +1081,22 @@ int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, unsigned long nr,
}
/*
- * If @nr is zero, the caller is attempting to write out most of
+ * The caller is attempting to write out most of
* the currently dirty pages. Let's take the current dirty page
* count and inflate it by 25% which should be large enough to
* flush out most dirty pages while avoiding getting livelocked by
* concurrent dirtiers.
+ *
+ * BTW the memcg stats are flushed periodically and this is best-effort
+ * estimation, so some potential error is ok.
*/
- if (!nr) {
- unsigned long filepages, headroom, dirty, writeback;
-
- mem_cgroup_wb_stats(wb, &filepages, &headroom, &dirty,
- &writeback);
- nr = dirty * 10 / 8;
- }
+ dirty = memcg_page_state(mem_cgroup_from_css(memcg_css), NR_FILE_DIRTY);
+ dirty = dirty * 10 / 8;
/* issue the writeback work */
work = kzalloc(sizeof(*work), GFP_NOWAIT | __GFP_NOWARN);
if (work) {
- work->nr_pages = nr;
+ work->nr_pages = dirty;
work->sync_mode = WB_SYNC_NONE;
work->range_cyclic = 1;
work->reason = reason;
@@ -1999,7 +2002,6 @@ static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
static long wb_writeback(struct bdi_writeback *wb,
struct wb_writeback_work *work)
{
- unsigned long wb_start = jiffies;
long nr_pages = work->nr_pages;
unsigned long dirtied_before = jiffies;
struct inode *inode;
@@ -2053,8 +2055,6 @@ static long wb_writeback(struct bdi_writeback *wb,
progress = __writeback_inodes_wb(wb, work);
trace_writeback_written(wb, work);
- wb_update_bandwidth(wb, wb_start);
-
/*
* Did we write something? Try for more
*
diff --git a/fs/fs_context.c b/fs/fs_context.c
index de1985eae535..b7e43a780a62 100644
--- a/fs/fs_context.c
+++ b/fs/fs_context.c
@@ -254,7 +254,7 @@ static struct fs_context *alloc_fs_context(struct file_system_type *fs_type,
struct fs_context *fc;
int ret = -ENOMEM;
- fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL);
+ fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL_ACCOUNT);
if (!fc)
return ERR_PTR(-ENOMEM);
@@ -649,7 +649,7 @@ const struct fs_context_operations legacy_fs_context_ops = {
*/
static int legacy_init_fs_context(struct fs_context *fc)
{
- fc->fs_private = kzalloc(sizeof(struct legacy_fs_context), GFP_KERNEL);
+ fc->fs_private = kzalloc(sizeof(struct legacy_fs_context), GFP_KERNEL_ACCOUNT);
if (!fc->fs_private)
return -ENOMEM;
fc->ops = &legacy_fs_context_ops;
diff --git a/fs/inode.c b/fs/inode.c
index 84c528cd1955..37710ca863b5 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -770,7 +770,7 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
return LRU_ROTATE;
}
- if (inode_has_buffers(inode) || inode->i_data.nrpages) {
+ if (inode_has_buffers(inode) || !mapping_empty(&inode->i_data)) {
__iget(inode);
spin_unlock(&inode->i_lock);
spin_unlock(lru_lock);
diff --git a/fs/locks.c b/fs/locks.c
index 3d6fb4ae847b..51a5b72ef302 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -2941,10 +2941,12 @@ static int __init filelock_init(void)
int i;
flctx_cache = kmem_cache_create("file_lock_ctx",
- sizeof(struct file_lock_context), 0, SLAB_PANIC, NULL);
+ sizeof(struct file_lock_context), 0,
+ SLAB_PANIC | SLAB_ACCOUNT, NULL);
filelock_cache = kmem_cache_create("file_lock_cache",
- sizeof(struct file_lock), 0, SLAB_PANIC, NULL);
+ sizeof(struct file_lock), 0,
+ SLAB_PANIC | SLAB_ACCOUNT, NULL);
for_each_possible_cpu(i) {
struct file_lock_list_struct *fll = per_cpu_ptr(&file_lock_list, i);
diff --git a/fs/namei.c b/fs/namei.c
index d049d3972695..95a881e0552b 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -4089,7 +4089,9 @@ int vfs_unlink(struct user_namespace *mnt_userns, struct inode *dir,
return -EPERM;
inode_lock(target);
- if (is_local_mountpoint(dentry))
+ if (IS_SWAPFILE(target))
+ error = -EPERM;
+ else if (is_local_mountpoint(dentry))
error = -EBUSY;
else {
error = security_inode_unlink(dir, dentry);
@@ -4597,6 +4599,10 @@ int vfs_rename(struct renamedata *rd)
else if (target)
inode_lock(target);
+ error = -EPERM;
+ if (IS_SWAPFILE(source) || (target && IS_SWAPFILE(target)))
+ goto out;
+
error = -EBUSY;
if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry))
goto out;
diff --git a/fs/namespace.c b/fs/namespace.c
index 12852363d90e..659a8f39c61a 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -203,7 +203,8 @@ static struct mount *alloc_vfsmnt(const char *name)
goto out_free_cache;
if (name) {
- mnt->mnt_devname = kstrdup_const(name, GFP_KERNEL);
+ mnt->mnt_devname = kstrdup_const(name,
+ GFP_KERNEL_ACCOUNT);
if (!mnt->mnt_devname)
goto out_free_id;
}
@@ -3370,7 +3371,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a
if (!ucounts)
return ERR_PTR(-ENOSPC);
- new_ns = kzalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
+ new_ns = kzalloc(sizeof(struct mnt_namespace), GFP_KERNEL_ACCOUNT);
if (!new_ns) {
dec_mnt_namespaces(ucounts);
return ERR_PTR(-ENOMEM);
@@ -4306,7 +4307,7 @@ void __init mnt_init(void)
int err;
mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount),
- 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
+ 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);
mount_hashtable = alloc_large_system_hash("Mount-cache",
sizeof(struct hlist_head),
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 48fd369c29a4..359524b7341f 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -16,6 +16,7 @@
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/time.h>
+#include <linux/delay.h>
#include <linux/quotaops.h>
#include <linux/sched/signal.h>
@@ -2721,7 +2722,7 @@ int ocfs2_inode_lock_tracker(struct inode *inode,
return status;
}
}
- return tmp_oh ? 1 : 0;
+ return 1;
}
void ocfs2_inode_unlock_tracker(struct inode *inode,
@@ -3912,6 +3913,17 @@ downconvert:
spin_unlock_irqrestore(&lockres->l_lock, flags);
ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb,
gen);
+ /* The dlm lock convert is being cancelled in background,
+ * ocfs2_cancel_convert() is asynchronous in fs/dlm,
+ * requeue it, try again later.
+ */
+ if (ret == -EBUSY) {
+ ctl->requeue = 1;
+ mlog(ML_BASTS, "lockres %s, ReQ: Downconvert busy\n",
+ lockres->l_name);
+ ret = 0;
+ msleep(20);
+ }
leave:
if (ret)
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index eda83487c9ec..f033de733adb 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -357,7 +357,6 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
}
oinfo->dqi_gi.dqi_sb = sb;
oinfo->dqi_gi.dqi_type = type;
- ocfs2_qinfo_lock_res_init(&oinfo->dqi_gqlock, oinfo);
oinfo->dqi_gi.dqi_entry_size = sizeof(struct ocfs2_global_disk_dqblk);
oinfo->dqi_gi.dqi_ops = &ocfs2_global_ops;
oinfo->dqi_gqi_bh = NULL;
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index b1a8b046f4c2..0e4b16d4c037 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -702,6 +702,8 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
info->dqi_priv = oinfo;
oinfo->dqi_type = type;
INIT_LIST_HEAD(&oinfo->dqi_chunk);
+ oinfo->dqi_gqinode = NULL;
+ ocfs2_qinfo_lock_res_init(&oinfo->dqi_gqlock, oinfo);
oinfo->dqi_rec = NULL;
oinfo->dqi_lqi_bh = NULL;
oinfo->dqi_libh = NULL;
diff --git a/fs/pipe.c b/fs/pipe.c
index 6d4342bad9f1..1fa1f52763f0 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -191,7 +191,7 @@ EXPORT_SYMBOL(generic_pipe_buf_try_steal);
*/
bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
{
- return try_get_page(buf->page);
+ return try_get_compound_head(buf->page, 1);
}
EXPORT_SYMBOL(generic_pipe_buf_get);
diff --git a/fs/select.c b/fs/select.c
index 945896d0ac9e..e83e563a351d 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -655,7 +655,7 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
goto out_nofds;
alloc_size = 6 * size;
- bits = kvmalloc(alloc_size, GFP_KERNEL);
+ bits = kvmalloc(alloc_size, GFP_KERNEL_ACCOUNT);
if (!bits)
goto out_nofds;
}
@@ -1000,7 +1000,7 @@ static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
len = min(todo, POLLFD_PER_PAGE);
walk = walk->next = kmalloc(struct_size(walk, entries, len),
- GFP_KERNEL);
+ GFP_KERNEL_ACCOUNT);
if (!walk) {
err = -ENOMEM;
goto out_fds;
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 5c2d806e6ae5..003f0d31743e 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -33,11 +33,6 @@ int sysctl_unprivileged_userfaultfd __read_mostly;
static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly;
-enum userfaultfd_state {
- UFFD_STATE_WAIT_API,
- UFFD_STATE_RUNNING,
-};
-
/*
* Start with fault_pending_wqh and fault_wqh so they're more likely
* to be in the same cacheline.
@@ -69,12 +64,10 @@ struct userfaultfd_ctx {
unsigned int flags;
/* features requested from the userspace */
unsigned int features;
- /* state machine */
- enum userfaultfd_state state;
/* released */
bool released;
/* memory mappings are changing because of non-cooperative event */
- bool mmap_changing;
+ atomic_t mmap_changing;
/* mm with one ore more vmas attached to this userfaultfd_ctx */
struct mm_struct *mm;
};
@@ -104,6 +97,14 @@ struct userfaultfd_wake_range {
unsigned long len;
};
+/* internal indication that UFFD_API ioctl was successfully executed */
+#define UFFD_FEATURE_INITIALIZED (1u << 31)
+
+static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx)
+{
+ return ctx->features & UFFD_FEATURE_INITIALIZED;
+}
+
static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
int wake_flags, void *key)
{
@@ -623,7 +624,8 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
* already released.
*/
out:
- WRITE_ONCE(ctx->mmap_changing, false);
+ atomic_dec(&ctx->mmap_changing);
+ VM_BUG_ON(atomic_read(&ctx->mmap_changing) < 0);
userfaultfd_ctx_put(ctx);
}
@@ -666,15 +668,14 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
refcount_set(&ctx->refcount, 1);
ctx->flags = octx->flags;
- ctx->state = UFFD_STATE_RUNNING;
ctx->features = octx->features;
ctx->released = false;
- ctx->mmap_changing = false;
+ atomic_set(&ctx->mmap_changing, 0);
ctx->mm = vma->vm_mm;
mmgrab(ctx->mm);
userfaultfd_ctx_get(octx);
- WRITE_ONCE(octx->mmap_changing, true);
+ atomic_inc(&octx->mmap_changing);
fctx->orig = octx;
fctx->new = ctx;
list_add_tail(&fctx->list, fcs);
@@ -721,7 +722,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma,
if (ctx->features & UFFD_FEATURE_EVENT_REMAP) {
vm_ctx->ctx = ctx;
userfaultfd_ctx_get(ctx);
- WRITE_ONCE(ctx->mmap_changing, true);
+ atomic_inc(&ctx->mmap_changing);
} else {
/* Drop uffd context if remap feature not enabled */
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
@@ -766,7 +767,7 @@ bool userfaultfd_remove(struct vm_area_struct *vma,
return true;
userfaultfd_ctx_get(ctx);
- WRITE_ONCE(ctx->mmap_changing, true);
+ atomic_inc(&ctx->mmap_changing);
mmap_read_unlock(mm);
msg_init(&ewq.msg);
@@ -810,7 +811,7 @@ int userfaultfd_unmap_prep(struct vm_area_struct *vma,
return -ENOMEM;
userfaultfd_ctx_get(ctx);
- WRITE_ONCE(ctx->mmap_changing, true);
+ atomic_inc(&ctx->mmap_changing);
unmap_ctx->ctx = ctx;
unmap_ctx->start = start;
unmap_ctx->end = end;
@@ -943,38 +944,33 @@ static __poll_t userfaultfd_poll(struct file *file, poll_table *wait)
poll_wait(file, &ctx->fd_wqh, wait);
- switch (ctx->state) {
- case UFFD_STATE_WAIT_API:
+ if (!userfaultfd_is_initialized(ctx))
return EPOLLERR;
- case UFFD_STATE_RUNNING:
- /*
- * poll() never guarantees that read won't block.
- * userfaults can be waken before they're read().
- */
- if (unlikely(!(file->f_flags & O_NONBLOCK)))
- return EPOLLERR;
- /*
- * lockless access to see if there are pending faults
- * __pollwait last action is the add_wait_queue but
- * the spin_unlock would allow the waitqueue_active to
- * pass above the actual list_add inside
- * add_wait_queue critical section. So use a full
- * memory barrier to serialize the list_add write of
- * add_wait_queue() with the waitqueue_active read
- * below.
- */
- ret = 0;
- smp_mb();
- if (waitqueue_active(&ctx->fault_pending_wqh))
- ret = EPOLLIN;
- else if (waitqueue_active(&ctx->event_wqh))
- ret = EPOLLIN;
- return ret;
- default:
- WARN_ON_ONCE(1);
+ /*
+ * poll() never guarantees that read won't block.
+ * userfaults can be waken before they're read().
+ */
+ if (unlikely(!(file->f_flags & O_NONBLOCK)))
return EPOLLERR;
- }
+ /*
+ * lockless access to see if there are pending faults
+ * __pollwait last action is the add_wait_queue but
+ * the spin_unlock would allow the waitqueue_active to
+ * pass above the actual list_add inside
+ * add_wait_queue critical section. So use a full
+ * memory barrier to serialize the list_add write of
+ * add_wait_queue() with the waitqueue_active read
+ * below.
+ */
+ ret = 0;
+ smp_mb();
+ if (waitqueue_active(&ctx->fault_pending_wqh))
+ ret = EPOLLIN;
+ else if (waitqueue_active(&ctx->event_wqh))
+ ret = EPOLLIN;
+
+ return ret;
}
static const struct file_operations userfaultfd_fops;
@@ -1169,7 +1165,7 @@ static ssize_t userfaultfd_read(struct file *file, char __user *buf,
int no_wait = file->f_flags & O_NONBLOCK;
struct inode *inode = file_inode(file);
- if (ctx->state == UFFD_STATE_WAIT_API)
+ if (!userfaultfd_is_initialized(ctx))
return -EINVAL;
for (;;) {
@@ -1700,7 +1696,7 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
user_uffdio_copy = (struct uffdio_copy __user *) arg;
ret = -EAGAIN;
- if (READ_ONCE(ctx->mmap_changing))
+ if (atomic_read(&ctx->mmap_changing))
goto out;
ret = -EFAULT;
@@ -1757,7 +1753,7 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg;
ret = -EAGAIN;
- if (READ_ONCE(ctx->mmap_changing))
+ if (atomic_read(&ctx->mmap_changing))
goto out;
ret = -EFAULT;
@@ -1807,7 +1803,7 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
struct userfaultfd_wake_range range;
bool mode_wp, mode_dontwake;
- if (READ_ONCE(ctx->mmap_changing))
+ if (atomic_read(&ctx->mmap_changing))
return -EAGAIN;
user_uffdio_wp = (struct uffdio_writeprotect __user *) arg;
@@ -1855,7 +1851,7 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
user_uffdio_continue = (struct uffdio_continue __user *)arg;
ret = -EAGAIN;
- if (READ_ONCE(ctx->mmap_changing))
+ if (atomic_read(&ctx->mmap_changing))
goto out;
ret = -EFAULT;
@@ -1908,9 +1904,10 @@ out:
static inline unsigned int uffd_ctx_features(__u64 user_features)
{
/*
- * For the current set of features the bits just coincide
+ * For the current set of features the bits just coincide. Set
+ * UFFD_FEATURE_INITIALIZED to mark the features as enabled.
*/
- return (unsigned int)user_features;
+ return (unsigned int)user_features | UFFD_FEATURE_INITIALIZED;
}
/*
@@ -1923,12 +1920,10 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
{
struct uffdio_api uffdio_api;
void __user *buf = (void __user *)arg;
+ unsigned int ctx_features;
int ret;
__u64 features;
- ret = -EINVAL;
- if (ctx->state != UFFD_STATE_WAIT_API)
- goto out;
ret = -EFAULT;
if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api)))
goto out;
@@ -1952,9 +1947,13 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
ret = -EFAULT;
if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
goto out;
- ctx->state = UFFD_STATE_RUNNING;
+
/* only enable the requested features for this uffd context */
- ctx->features = uffd_ctx_features(features);
+ ctx_features = uffd_ctx_features(features);
+ ret = -EINVAL;
+ if (cmpxchg(&ctx->features, 0, ctx_features) != 0)
+ goto err_out;
+
ret = 0;
out:
return ret;
@@ -1971,7 +1970,7 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd,
int ret = -EINVAL;
struct userfaultfd_ctx *ctx = file->private_data;
- if (cmd != UFFDIO_API && ctx->state == UFFD_STATE_WAIT_API)
+ if (cmd != UFFDIO_API && !userfaultfd_is_initialized(ctx))
return -EINVAL;
switch(cmd) {
@@ -2085,9 +2084,8 @@ SYSCALL_DEFINE1(userfaultfd, int, flags)
refcount_set(&ctx->refcount, 1);
ctx->flags = flags;
ctx->features = 0;
- ctx->state = UFFD_STATE_WAIT_API;
ctx->released = false;
- ctx->mmap_changing = false;
+ atomic_set(&ctx->mmap_changing, 0);
ctx->mm = current->mm;
/* prevent the mm struct to be freed */
mmgrab(ctx->mm);