From 1383399d7be029281997889df23150fa6c16be6e Mon Sep 17 00:00:00 2001
From: Vladimir Davydov
Date: Mon, 23 May 2016 16:22:29 -0700
Subject: mm: memcontrol: fix possible css ref leak on oom

mem_cgroup_oom may be invoked multiple times while a process is handling
a page fault, in which case current->memcg_in_oom will be overwritten,
leaking the previously taken css reference.

Link: http://lkml.kernel.org/r/1464019330-7579-1-git-send-email-vdavydov@virtuozzo.com
Signed-off-by: Vladimir Davydov
Acked-by: Michal Hocko
Cc: Johannes Weiner
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memcontrol.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b3f16ab4b431..cf428d7b9a03 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1604,7 +1604,7 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
 
 static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
 {
-        if (!current->memcg_may_oom)
+        if (!current->memcg_may_oom || current->memcg_in_oom)
                 return;
         /*
          * We are in the middle of the charge context here, so we
--
cgit v1.2.3-59-g8ed1b
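
The leak pattern this one-liner closes can be seen in isolation.  Below
is a minimal userspace analogue of the bug and the fix; css_get/css_put
and memcg_in_oom here are simplified stand-ins for the kernel
primitives, not the kernel API itself:

  #include <stdio.h>

  struct css { int refcount; };           /* stand-in for a cgroup css */

  static struct css *memcg_in_oom;        /* stand-in for current->memcg_in_oom */

  static void css_get(struct css *c) { c->refcount++; }
  static void css_put(struct css *c) { c->refcount--; }

  /* Pre-patch behaviour: a second invocation overwrites the stash, so the
   * reference taken by the first invocation is never dropped. */
  static void mem_cgroup_oom_buggy(struct css *c)
  {
          css_get(c);
          memcg_in_oom = c;
  }

  /* Post-patch behaviour: bail out if an OOM context is already recorded. */
  static void mem_cgroup_oom_fixed(struct css *c)
  {
          if (memcg_in_oom)
                  return;
          css_get(c);
          memcg_in_oom = c;
  }

  int main(void)
  {
          struct css memcg = { .refcount = 0 };

          mem_cgroup_oom_buggy(&memcg);
          mem_cgroup_oom_buggy(&memcg);   /* fault handling re-enters */
          css_put(memcg_in_oom);          /* unwind drops only one ref */
          memcg_in_oom = NULL;
          printf("buggy: leaked refs = %d\n", memcg.refcount);  /* 1 */

          memcg.refcount = 0;
          mem_cgroup_oom_fixed(&memcg);
          mem_cgroup_oom_fixed(&memcg);   /* second call is a no-op */
          css_put(memcg_in_oom);
          memcg_in_oom = NULL;
          printf("fixed: leaked refs = %d\n", memcg.refcount);  /* 0 */
          return 0;
  }

The buggy variant leaks one reference per re-entry because the unwind
path only drops the reference for the stash it finds; the fixed variant
makes re-entry a no-op.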
From dc0ef0df7b6a90892ec41933212ac701152a254c Mon Sep 17 00:00:00 2001
From: Michal Hocko
Date: Mon, 23 May 2016 16:25:27 -0700
Subject: mm: make mmap_sem for write waits killable for mm syscalls

This is a follow up work for oom_reaper [1].  As the async OOM killing
depends on mmap_sem for read we would really appreciate if a holder for
write didn't stand in the way.  This patchset changes many of the
down_write calls to be killable to help those cases when the writer is
blocked and waiting for readers to release the lock and so help
__oom_reap_task to process the oom victim.

Most of the patches are really trivial because the lock is held from
shallow syscall paths where we can return EINTR trivially and allow the
current task to die (note that EINTR will never get to userspace as the
task has a fatal signal pending).  Others seem easy as well because the
callers already handle fatal errors, bail out, and return to userspace,
which should be sufficient to handle the failure gracefully.  I am not
familiar with all those code paths so a deeper review is really
appreciated.

As this work touches more areas which are not directly connected I have
tried to keep the CC list as small as possible, and people who I
believed would be familiar are CCed only on the specific patches (all
should have received the cover though).

This patchset is based on linux-next and depends on down_write_killable
for rw_semaphores, which got merged into the tip locking/rwsem branch
and is part of this next tree.  I guess it would be easiest to route
these patches via mmotm because of the dependency on the tip tree, but
if the respective maintainers prefer another way I have no objections.

I haven't covered all the down_write(mm->mmap_sem) instances here:

  $ git grep "down_write(.*\)" next/master | wc -l
  98
  $ git grep "down_write(.*\)" | wc -l
  62

I have tried to cover those which should be relatively easy to review
in this series, because this alone should be a nice improvement.  Other
places can be changed on top.

[0] http://lkml.kernel.org/r/1456752417-9626-1-git-send-email-mhocko@kernel.org
[1] http://lkml.kernel.org/r/1452094975-551-1-git-send-email-mhocko@kernel.org
[2] http://lkml.kernel.org/r/1456750705-7141-1-git-send-email-mhocko@kernel.org

This patch (of 18):

This is the first step in making mmap_sem write waiters killable.  It
focuses on the trivial ones, i.e. the syscalls which take the lock
early after entry and do not change any state before that, so it is
very easy to convert them to down_write_killable and return -EINTR
immediately.  This will allow the waiter to pass away without blocking
mmap_sem, which might be required to make forward progress.  E.g. the
oom reaper will need the lock for reading to dismantle the OOM victim's
address space.

The only tricky function in this patch is vm_mmap_pgoff, which has many
call sites via vm_mmap.  To reduce the risk, keep vm_mmap with the
original non-killable semantic for now.

vm_munmap callers do not bother checking the return value, so open code
it into the munmap syscall path for now, for simplicity.

Signed-off-by: Michal Hocko
Acked-by: Vlastimil Babka
Cc: Mel Gorman
Cc: "Kirill A. Shutemov"
Cc: Konstantin Khlebnikov
Cc: Hugh Dickins
Cc: Andrea Arcangeli
Cc: David Rientjes
Cc: Dave Hansen
Cc: Johannes Weiner
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/internal.h |  5 +++--
 mm/madvise.c  |  8 +++++---
 mm/mlock.c    | 16 ++++++++++------
 mm/mmap.c     | 27 +++++++++++++++++++++++----
 mm/mprotect.c |  3 ++-
 mm/mremap.c   |  3 ++-
 mm/nommu.c    |  2 +-
 mm/util.c     | 12 +++++++++---
 8 files changed, 55 insertions(+), 21 deletions(-)

diff --git a/mm/internal.h b/mm/internal.h
index f6f3353b0868..bff7fd702331 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -442,9 +442,10 @@ extern u64 hwpoison_filter_flags_value;
 extern u64 hwpoison_filter_memcg;
 extern u32 hwpoison_filter_enable;
 
-extern unsigned long vm_mmap_pgoff(struct file *, unsigned long,
+extern unsigned long __must_check vm_mmap_pgoff(struct file *, unsigned long,
         unsigned long, unsigned long,
-        unsigned long, unsigned long);
+        unsigned long, unsigned long,
+        bool);
 
 extern void set_pageblock_order(void);
 unsigned long reclaim_clean_pages_from_list(struct zone *zone,
diff --git a/mm/madvise.c b/mm/madvise.c
index 07427d3fcead..93fb63e88b5e 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -707,10 +707,12 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
                 return error;
 
         write = madvise_need_mmap_write(behavior);
-        if (write)
-                down_write(&current->mm->mmap_sem);
-        else
+        if (write) {
+                if (down_write_killable(&current->mm->mmap_sem))
+                        return -EINTR;
+        } else {
                 down_read(&current->mm->mmap_sem);
+        }
 
         /*
          * If the interval [start,end) covers some unmapped address
diff --git a/mm/mlock.c b/mm/mlock.c
index 96f001041928..ef8dc9f395c4 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -617,7 +617,7 @@ static int apply_vma_lock_flags(unsigned long start, size_t len,
         return error;
 }
 
-static int do_mlock(unsigned long start, size_t len, vm_flags_t flags)
+static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t flags)
 {
         unsigned long locked;
         unsigned long lock_limit;
@@ -635,7 +635,8 @@ static int do_mlock(unsigned long start, size_t len, vm_flags_t flags)
         lock_limit >>= PAGE_SHIFT;
         locked = len >> PAGE_SHIFT;
 
-        down_write(&current->mm->mmap_sem);
+        if (down_write_killable(&current->mm->mmap_sem))
+                return -EINTR;
 
         locked += current->mm->locked_vm;
@@ -678,7 +679,8 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
         len = PAGE_ALIGN(len + (offset_in_page(start)));
         start &= PAGE_MASK;
 
-        down_write(&current->mm->mmap_sem);
+        if (down_write_killable(&current->mm->mmap_sem))
+                return -EINTR;
         ret = apply_vma_lock_flags(start, len, 0);
         up_write(&current->mm->mmap_sem);
 
@@ -748,9 +750,10 @@ SYSCALL_DEFINE1(mlockall, int, flags)
         lock_limit = rlimit(RLIMIT_MEMLOCK);
         lock_limit >>= PAGE_SHIFT;
 
-        ret = -ENOMEM;
-        down_write(&current->mm->mmap_sem);
+        if (down_write_killable(&current->mm->mmap_sem))
+                return -EINTR;
 
+        ret = -ENOMEM;
         if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) ||
             capable(CAP_IPC_LOCK))
                 ret = apply_mlockall_flags(flags);
@@ -765,7 +768,8 @@ SYSCALL_DEFINE0(munlockall)
 {
         int ret;
 
-        down_write(&current->mm->mmap_sem);
+        if (down_write_killable(&current->mm->mmap_sem))
+                return -EINTR;
         ret = apply_mlockall_flags(0);
         up_write(&current->mm->mmap_sem);
         return ret;
diff --git a/mm/mmap.c b/mm/mmap.c
index b9274a0c82c9..11e1f2ca72af 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -178,7 +178,8 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
         unsigned long min_brk;
         bool populate;
 
-        down_write(&mm->mmap_sem);
+        if (down_write_killable(&mm->mmap_sem))
+                return -EINTR;
 
 #ifdef CONFIG_COMPAT_BRK
         /*
@@ -1332,7 +1333,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
 
         flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
 
-        retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
+        retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff, true);
 out_fput:
         if (file)
                 fput(file);
@@ -2493,6 +2494,10 @@ int vm_munmap(unsigned long start, size_t len)
         int ret;
         struct mm_struct *mm = current->mm;
 
+        /*
+         * XXX convert to down_write_killable as soon as all users are able
+         * to handle the error.
+         */
         down_write(&mm->mmap_sem);
         ret = do_munmap(mm, start, len);
         up_write(&mm->mmap_sem);
@@ -2502,8 +2507,15 @@ EXPORT_SYMBOL(vm_munmap);
 
 SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
 {
+        int ret;
+        struct mm_struct *mm = current->mm;
+
         profile_munmap(addr);
-        return vm_munmap(addr, len);
+        if (down_write_killable(&mm->mmap_sem))
+                return -EINTR;
+        ret = do_munmap(mm, addr, len);
+        up_write(&mm->mmap_sem);
+        return ret;
 }
@@ -2535,7 +2547,9 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
         if (pgoff + (size >> PAGE_SHIFT) < pgoff)
                 return ret;
 
-        down_write(&mm->mmap_sem);
+        if (down_write_killable(&mm->mmap_sem))
+                return -EINTR;
+
         vma = find_vma(mm, start);
 
         if (!vma || !(vma->vm_flags & VM_SHARED))
@@ -2700,6 +2714,11 @@ unsigned long vm_brk(unsigned long addr, unsigned long len)
         unsigned long ret;
         bool populate;
 
+        /*
+         * XXX not all users are chcecking the return value, convert
+         * to down_write_killable after they are able to cope with
+         * error
+         */
         down_write(&mm->mmap_sem);
         ret = do_brk(addr, len);
         populate = ((mm->def_flags & VM_LOCKED) != 0);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index b650c5412f58..5019a1ef2848 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -379,7 +379,8 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
 
         reqprot = prot;
 
-        down_write(&current->mm->mmap_sem);
+        if (down_write_killable(&current->mm->mmap_sem))
+                return -EINTR;
 
         vma = find_vma(current->mm, start);
         error = -ENOMEM;
diff --git a/mm/mremap.c b/mm/mremap.c
index 9dc499977924..1f157adfdaf9 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -503,7 +503,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
         if (!new_len)
                 return ret;
 
-        down_write(&current->mm->mmap_sem);
+        if (down_write_killable(&current->mm->mmap_sem))
+                return -EINTR;
 
         if (flags & MREMAP_FIXED) {
                 ret = mremap_to(addr, old_len, new_addr, new_len,
diff --git a/mm/nommu.c b/mm/nommu.c
index c8bd59a03c71..b74512746aae 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1446,7 +1446,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
 
         flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
 
-        retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
+        retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff, true);
 
         if (file)
                 fput(file);
diff --git a/mm/util.c b/mm/util.c
index 8a1b3a1fb595..03b237746850 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -289,7 +289,7 @@ EXPORT_SYMBOL_GPL(get_user_pages_fast);
 
 unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
         unsigned long len, unsigned long prot,
-        unsigned long flag, unsigned long pgoff)
+        unsigned long flag, unsigned long pgoff, bool killable)
 {
         unsigned long ret;
         struct mm_struct *mm = current->mm;
@@ -297,7 +297,12 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
 
         ret = security_mmap_file(file, prot, flag);
         if (!ret) {
-                down_write(&mm->mmap_sem);
+                if (killable) {
+                        if (down_write_killable(&mm->mmap_sem))
+                                return -EINTR;
+                } else {
+                        down_write(&mm->mmap_sem);
+                }
                 ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff,
                                     &populate);
                 up_write(&mm->mmap_sem);
@@ -307,6 +312,7 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
         return ret;
 }
 
+/* XXX are all callers checking an error */
 unsigned long vm_mmap(struct file *file, unsigned long addr,
         unsigned long len, unsigned long prot,
         unsigned long flag, unsigned long offset)
@@ -316,7 +322,7 @@ unsigned long vm_mmap(struct file *file, unsigned long addr,
         if (unlikely(offset_in_page(offset)))
                 return -EINVAL;
 
-        return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
+        return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT, false);
 }
 EXPORT_SYMBOL(vm_mmap);
--
cgit v1.2.3-59-g8ed1b
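
All the conversions in this patch share one shape.  A minimal sketch of
the pattern follows; sys_example() is a made-up function for
illustration, not code from the patch:

  /* Sketch of the down_write -> down_write_killable conversion. */
  #include <linux/mm_types.h>
  #include <linux/rwsem.h>
  #include <linux/sched.h>

  static long sys_example(void)
  {
          struct mm_struct *mm = current->mm;
          long ret = 0;

          /* Before: down_write(&mm->mmap_sem); -- an unkillable sleep
           * that can stall __oom_reap_task while readers hold the lock. */
          if (down_write_killable(&mm->mmap_sem))
                  return -EINTR;  /* task is dying; userspace never sees this */

          /* ... modify the address space under the lock ... */

          up_write(&mm->mmap_sem);
          return ret;
  }

Because down_write_killable only fails when a fatal signal is pending,
the early return is safe: the task exits before the syscall could ever
return -EINTR to userspace.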
Shutemov" Cc: Mel Gorman Cc: Oleg Nesterov Cc: Andrea Arcangeli Cc: Al Viro Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- mm/internal.h | 3 +-- mm/mmap.c | 2 +- mm/nommu.c | 2 +- mm/util.c | 13 ++++--------- 5 files changed, 8 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/include/linux/mm.h b/include/linux/mm.h index b530c99e8e81..d5eb8dddd7c0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2013,7 +2013,7 @@ static inline void mm_populate(unsigned long addr, unsigned long len) {} /* These take the mm semaphore themselves */ extern unsigned long vm_brk(unsigned long, unsigned long); extern int vm_munmap(unsigned long, size_t); -extern unsigned long vm_mmap(struct file *, unsigned long, +extern unsigned long __must_check vm_mmap(struct file *, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); diff --git a/mm/internal.h b/mm/internal.h index bff7fd702331..a37e5b6f9d25 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -444,8 +444,7 @@ extern u32 hwpoison_filter_enable; extern unsigned long __must_check vm_mmap_pgoff(struct file *, unsigned long, unsigned long, unsigned long, - unsigned long, unsigned long, - bool); + unsigned long, unsigned long); extern void set_pageblock_order(void); unsigned long reclaim_clean_pages_from_list(struct zone *zone, diff --git a/mm/mmap.c b/mm/mmap.c index 11e1f2ca72af..420088682d4a 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1333,7 +1333,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff, true); + retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); out_fput: if (file) fput(file); diff --git a/mm/nommu.c b/mm/nommu.c index b74512746aae..c8bd59a03c71 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1446,7 +1446,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff, true); + retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); if (file) fput(file); diff --git a/mm/util.c b/mm/util.c index 03b237746850..917e0e3d0f8e 100644 --- a/mm/util.c +++ b/mm/util.c @@ -289,7 +289,7 @@ EXPORT_SYMBOL_GPL(get_user_pages_fast); unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, - unsigned long flag, unsigned long pgoff, bool killable) + unsigned long flag, unsigned long pgoff) { unsigned long ret; struct mm_struct *mm = current->mm; @@ -297,12 +297,8 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, ret = security_mmap_file(file, prot, flag); if (!ret) { - if (killable) { - if (down_write_killable(&mm->mmap_sem)) - return -EINTR; - } else { - down_write(&mm->mmap_sem); - } + if (down_write_killable(&mm->mmap_sem)) + return -EINTR; ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff, &populate); up_write(&mm->mmap_sem); @@ -312,7 +308,6 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, return ret; } -/* XXX are all callers checking an error */ unsigned long vm_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flag, unsigned long offset) @@ -322,7 +317,7 @@ unsigned long vm_mmap(struct file *file, unsigned long addr, if (unlikely(offset_in_page(offset))) return -EINVAL; - return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT, false); + return 
From ae7987835643e470cb220e6685bd36d92179ef9c Mon Sep 17 00:00:00 2001
From: Michal Hocko
Date: Mon, 23 May 2016 16:25:33 -0700
Subject: mm: make vm_munmap killable

Almost all current users of vm_munmap are ignoring the return value and
so they do not handle potential errors.  This means that some VMAs
might stay behind.  This patch doesn't try to solve those potential
problems.  Quite the contrary, it adds a new failure mode by using
down_write_killable in vm_munmap.  This should be safer than other
failure modes, though, because the process is guaranteed to die as soon
as it leaves the kernel and exit_mmap will clean the whole address
space.

This will help in OOM conditions when the oom victim might be stuck
waiting for mmap_sem for write, which in turn can block the oom_reaper,
which relies on mmap_sem for read to make forward progress and reclaim
the address space of the victim.

Signed-off-by: Michal Hocko
Cc: Oleg Nesterov
Cc: "Kirill A. Shutemov"
Cc: Konstantin Khlebnikov
Cc: Andrea Arcangeli
Cc: Alexander Viro
Cc: Vlastimil Babka
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/mmap.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/mm/mmap.c b/mm/mmap.c
index 420088682d4a..ca292a7c2b68 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2494,11 +2494,9 @@ int vm_munmap(unsigned long start, size_t len)
         int ret;
         struct mm_struct *mm = current->mm;
 
-        /*
-         * XXX convert to down_write_killable as soon as all users are able
-         * to handle the error.
-         */
-        down_write(&mm->mmap_sem);
+        if (down_write_killable(&mm->mmap_sem))
+                return -EINTR;
+
         ret = do_munmap(mm, start, len);
         up_write(&mm->mmap_sem);
         return ret;
--
cgit v1.2.3-59-g8ed1b
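
A sketch of why an ignored vm_munmap failure is tolerable here;
drop_mapping() is a hypothetical caller, not code from the patch:

  #include <linux/mm.h>
  #include <linux/printk.h>

  static void drop_mapping(unsigned long addr, size_t len)
  {
          int ret = vm_munmap(addr, len);

          /* -EINTR means a fatal signal is pending: the process dies on
           * its way out of the kernel and exit_mmap() tears down the
           * whole address space, so the stale VMA cannot outlive the
           * task. */
          if (ret)
                  pr_debug("vm_munmap failed: %d\n", ret);
  }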
Shutemov" Cc: Oleg Nesterov Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- mm/mmap.c | 9 +++------ 2 files changed, 4 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/include/linux/mm.h b/include/linux/mm.h index d5eb8dddd7c0..2835d598d258 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2011,7 +2011,7 @@ static inline void mm_populate(unsigned long addr, unsigned long len) {} #endif /* These take the mm semaphore themselves */ -extern unsigned long vm_brk(unsigned long, unsigned long); +extern unsigned long __must_check vm_brk(unsigned long, unsigned long); extern int vm_munmap(unsigned long, size_t); extern unsigned long __must_check vm_mmap(struct file *, unsigned long, unsigned long, unsigned long, diff --git a/mm/mmap.c b/mm/mmap.c index ca292a7c2b68..d3d9a94ca031 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2712,12 +2712,9 @@ unsigned long vm_brk(unsigned long addr, unsigned long len) unsigned long ret; bool populate; - /* - * XXX not all users are chcecking the return value, convert - * to down_write_killable after they are able to cope with - * error - */ - down_write(&mm->mmap_sem); + if (down_write_killable(&mm->mmap_sem)) + return -EINTR; + ret = do_brk(addr, len); populate = ((mm->def_flags & VM_LOCKED) != 0); up_write(&mm->mmap_sem); -- cgit v1.2.3-59-g8ed1b