aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/fs/userfaultfd.c
diff options
context:
space:
mode:
authorMike Rapoport <rppt@linux.vnet.ibm.com>2018-06-07 17:09:25 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2018-06-07 17:34:38 -0700
commitdf2cc96e77011cf7989208b206da9817e0321028 (patch)
tree5831fc1b33713f8073c5661a36890f7822a4f484 /fs/userfaultfd.c
parentmm: memcg: allow lowering memory.swap.max below the current usage (diff)
downloadwireguard-linux-df2cc96e77011cf7989208b206da9817e0321028.tar.xz
wireguard-linux-df2cc96e77011cf7989208b206da9817e0321028.zip
userfaultfd: prevent non-cooperative events vs mcopy_atomic races
If a process monitored with userfaultfd changes its memory mappings or
forks() at the same time as the uffd monitor fills the process memory with
UFFDIO_COPY, the actual creation of page table entries and copying of
the data in mcopy_atomic may happen either before or after the memory
mapping modifications, and there is no way for the uffd monitor to
maintain a consistent view of the process memory layout.

For instance, let's consider fork() running in parallel with
userfaultfd_copy():

process                          | uffd monitor
---------------------------------+------------------------------
fork()                           | userfaultfd_copy()
...                              | ...
    dup_mmap()                   |     down_read(mmap_sem)
    down_write(mmap_sem)         |     /* create PTEs, copy data */
        dup_uffd()               |     up_read(mmap_sem)
        copy_page_range()        |
        up_write(mmap_sem)       |
        dup_uffd_complete()      |
            /* notify monitor */ |

If the userfaultfd_copy() takes the mmap_sem first, the new page(s) will
be present by the time copy_page_range() is called and they will appear
in the child's memory mappings. However, if the fork() is the first to
take the mmap_sem, the new pages won't be mapped in the child's address
space.

If the pages are not present and the child tries to access them, the
monitor will get a page fault notification and everything is fine.
However, if the pages *are present*, the child can access them without
uffd noticing. And if we copy them into the child, it'll see the wrong
data. Since we are talking about background copy, we'd need to decide
whether the pages should be copied or not regardless of #PF notifications.

Since the userfaultfd monitor has no way to determine what the order was,
let's disallow userfaultfd_copy in parallel with the non-cooperative
events. In such a case we return -EAGAIN and the uffd monitor can
understand that userfaultfd_copy() clashed with a non-cooperative event
and take an appropriate action.
Link: http://lkml.kernel.org/r/1527061324-19949-1-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport <rppt@linux.vnet.ibm.com> Acked-by: Pavel Emelyanov <xemul@virtuozzo.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Andrei Vagin <avagin@virtuozzo.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'fs/userfaultfd.c')
-rw-r--r--fs/userfaultfd.c22
1 files changed, 20 insertions, 2 deletions
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index cec550c8468f..123bf7d516fc 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -62,6 +62,8 @@ struct userfaultfd_ctx {
enum userfaultfd_state state;
/* released */
bool released;
+ /* memory mappings are changing because of non-cooperative event */
+ bool mmap_changing;
/* mm with one ore more vmas attached to this userfaultfd_ctx */
struct mm_struct *mm;
};
@@ -641,6 +643,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
* already released.
*/
out:
+ WRITE_ONCE(ctx->mmap_changing, false);
userfaultfd_ctx_put(ctx);
}
@@ -686,10 +689,12 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
ctx->state = UFFD_STATE_RUNNING;
ctx->features = octx->features;
ctx->released = false;
+ ctx->mmap_changing = false;
ctx->mm = vma->vm_mm;
mmgrab(ctx->mm);
userfaultfd_ctx_get(octx);
+ WRITE_ONCE(octx->mmap_changing, true);
fctx->orig = octx;
fctx->new = ctx;
list_add_tail(&fctx->list, fcs);
@@ -732,6 +737,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma,
if (ctx && (ctx->features & UFFD_FEATURE_EVENT_REMAP)) {
vm_ctx->ctx = ctx;
userfaultfd_ctx_get(ctx);
+ WRITE_ONCE(ctx->mmap_changing, true);
}
}
@@ -772,6 +778,7 @@ bool userfaultfd_remove(struct vm_area_struct *vma,
return true;
userfaultfd_ctx_get(ctx);
+ WRITE_ONCE(ctx->mmap_changing, true);
up_read(&mm->mmap_sem);
msg_init(&ewq.msg);
@@ -815,6 +822,7 @@ int userfaultfd_unmap_prep(struct vm_area_struct *vma,
return -ENOMEM;
userfaultfd_ctx_get(ctx);
+ WRITE_ONCE(ctx->mmap_changing, true);
unmap_ctx->ctx = ctx;
unmap_ctx->start = start;
unmap_ctx->end = end;
@@ -1653,6 +1661,10 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
user_uffdio_copy = (struct uffdio_copy __user *) arg;
+ ret = -EAGAIN;
+ if (READ_ONCE(ctx->mmap_changing))
+ goto out;
+
ret = -EFAULT;
if (copy_from_user(&uffdio_copy, user_uffdio_copy,
/* don't copy "copy" last field */
@@ -1674,7 +1686,7 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
goto out;
if (mmget_not_zero(ctx->mm)) {
ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
- uffdio_copy.len);
+ uffdio_copy.len, &ctx->mmap_changing);
mmput(ctx->mm);
} else {
return -ESRCH;
@@ -1705,6 +1717,10 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg;
+ ret = -EAGAIN;
+ if (READ_ONCE(ctx->mmap_changing))
+ goto out;
+
ret = -EFAULT;
if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage,
/* don't copy "zeropage" last field */
@@ -1721,7 +1737,8 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
if (mmget_not_zero(ctx->mm)) {
ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start,
- uffdio_zeropage.range.len);
+ uffdio_zeropage.range.len,
+ &ctx->mmap_changing);
mmput(ctx->mm);
} else {
return -ESRCH;
@@ -1900,6 +1917,7 @@ SYSCALL_DEFINE1(userfaultfd, int, flags)
ctx->features = 0;
ctx->state = UFFD_STATE_WAIT_API;
ctx->released = false;
+ ctx->mmap_changing = false;
ctx->mm = current->mm;
/* prevent the mm struct to be freed */
mmgrab(ctx->mm);