From 405ce051236cc65b30bbfe490b28ce60ae6aed85 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Thu, 21 Apr 2022 16:35:33 -0700 Subject: mm/hwpoison: fix race between hugetlb free/demotion and memory_failure_hugetlb() There is a race condition between memory_failure_hugetlb() and hugetlb free/demotion, which causes setting PageHWPoison flag on the wrong page. The one simple result is that wrong processes can be killed, but another (more serious) one is that the actual error is left unhandled, so no one prevents later access to it, and that might lead to more serious results like consuming corrupted data. Think about the below race window: CPU 1 CPU 2 memory_failure_hugetlb struct page *head = compound_head(p); hugetlb page might be freed to buddy, or even changed to another compound page. get_hwpoison_page -- page is not what we want now... The current code first does prechecks roughly and then reconfirms after taking refcount, but it's found that it makes code overly complicated, so move the prechecks in a single hugetlb_lock range. A newly introduced function, try_memory_failure_hugetlb(), always takes hugetlb_lock (even for non-hugetlb pages). That can be improved, but memory_failure() is rare in principle, so should not be a big problem. Link: https://lkml.kernel.org/r/20220408135323.1559401-2-naoya.horiguchi@linux.dev Fixes: 761ad8d7c7b5 ("mm: hwpoison: introduce memory_failure_hugetlb()") Signed-off-by: Naoya Horiguchi Reported-by: Mike Kravetz Reviewed-by: Miaohe Lin Reviewed-by: Mike Kravetz Cc: Yang Shi Cc: Dan Carpenter Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 10 ++++ mm/memory-failure.c | 145 +++++++++++++++++++++++++++++++++++++--------------- 2 files changed, 113 insertions(+), 42 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f8ca7cca3c1a..3fc721789743 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6785,6 +6785,16 @@ int get_hwpoison_huge_page(struct page *page, bool *hugetlb) return ret; } +int get_huge_page_for_hwpoison(unsigned long pfn, int flags) +{ + int ret; + + spin_lock_irq(&hugetlb_lock); + ret = __get_huge_page_for_hwpoison(pfn, flags); + spin_unlock_irq(&hugetlb_lock); + return ret; +} + void putback_active_hugepage(struct page *page) { spin_lock_irq(&hugetlb_lock); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index dcb6bb9cf731..2020944398c9 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1498,50 +1498,113 @@ static int try_to_split_thp_page(struct page *page, const char *msg) return 0; } -static int memory_failure_hugetlb(unsigned long pfn, int flags) +/* + * Called from hugetlb code with hugetlb_lock held. + * + * Return values: + * 0 - free hugepage + * 1 - in-use hugepage + * 2 - not a hugepage + * -EBUSY - the hugepage is busy (try to retry) + * -EHWPOISON - the hugepage is already hwpoisoned + */ +int __get_huge_page_for_hwpoison(unsigned long pfn, int flags) +{ + struct page *page = pfn_to_page(pfn); + struct page *head = compound_head(page); + int ret = 2; /* fallback to normal page handling */ + bool count_increased = false; + + if (!PageHeadHuge(head)) + goto out; + + if (flags & MF_COUNT_INCREASED) { + ret = 1; + count_increased = true; + } else if (HPageFreed(head) || HPageMigratable(head)) { + ret = get_page_unless_zero(head); + if (ret) + count_increased = true; + } else { + ret = -EBUSY; + goto out; + } + + if (TestSetPageHWPoison(head)) { + ret = -EHWPOISON; + goto out; + } + + return ret; +out: + if (count_increased) + put_page(head); + return ret; +} + +#ifdef CONFIG_HUGETLB_PAGE +/* + * Taking refcount of hugetlb pages needs extra care about race conditions + * with basic operations like hugepage allocation/free/demotion. + * So some of prechecks for hwpoison (pinning, and testing/setting + * PageHWPoison) should be done in single hugetlb_lock range. + */ +static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb) { - struct page *p = pfn_to_page(pfn); - struct page *head = compound_head(p); int res; + struct page *p = pfn_to_page(pfn); + struct page *head; unsigned long page_flags; + bool retry = true; - if (TestSetPageHWPoison(head)) { - pr_err("Memory failure: %#lx: already hardware poisoned\n", - pfn); - res = -EHWPOISON; - if (flags & MF_ACTION_REQUIRED) + *hugetlb = 1; +retry: + res = get_huge_page_for_hwpoison(pfn, flags); + if (res == 2) { /* fallback to normal page handling */ + *hugetlb = 0; + return 0; + } else if (res == -EHWPOISON) { + pr_err("Memory failure: %#lx: already hardware poisoned\n", pfn); + if (flags & MF_ACTION_REQUIRED) { + head = compound_head(p); res = kill_accessing_process(current, page_to_pfn(head), flags); + } return res; + } else if (res == -EBUSY) { + if (retry) { + retry = false; + goto retry; + } + action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED); + return res; + } + + head = compound_head(p); + lock_page(head); + + if (hwpoison_filter(p)) { + ClearPageHWPoison(head); + res = -EOPNOTSUPP; + goto out; } num_poisoned_pages_inc(); - if (!(flags & MF_COUNT_INCREASED)) { - res = get_hwpoison_page(p, flags); - if (!res) { - lock_page(head); - if (hwpoison_filter(p)) { - if (TestClearPageHWPoison(head)) - num_poisoned_pages_dec(); - unlock_page(head); - return -EOPNOTSUPP; - } - unlock_page(head); - res = MF_FAILED; - if (__page_handle_poison(p)) { - page_ref_inc(p); - res = MF_RECOVERED; - } - action_result(pfn, MF_MSG_FREE_HUGE, res); - return res == MF_RECOVERED ? 0 : -EBUSY; - } else if (res < 0) { - action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED); - return -EBUSY; + /* + * Handling free hugepage. The possible race with hugepage allocation + * or demotion can be prevented by PageHWPoison flag. + */ + if (res == 0) { + unlock_page(head); + res = MF_FAILED; + if (__page_handle_poison(p)) { + page_ref_inc(p); + res = MF_RECOVERED; } + action_result(pfn, MF_MSG_FREE_HUGE, res); + return res == MF_RECOVERED ? 0 : -EBUSY; } - lock_page(head); - /* * The page could have changed compound pages due to race window. * If this happens just bail out. @@ -1554,14 +1617,6 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags) page_flags = head->flags; - if (hwpoison_filter(p)) { - if (TestClearPageHWPoison(head)) - num_poisoned_pages_dec(); - put_page(p); - res = -EOPNOTSUPP; - goto out; - } - /* * TODO: hwpoison for pud-sized hugetlb doesn't work right now, so * simply disable it. In order to make it work properly, we need @@ -1588,6 +1643,12 @@ out: unlock_page(head); return res; } +#else +static inline int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb) +{ + return 0; +} +#endif static int memory_failure_dev_pagemap(unsigned long pfn, int flags, struct dev_pagemap *pgmap) @@ -1712,6 +1773,7 @@ int memory_failure(unsigned long pfn, int flags) int res = 0; unsigned long page_flags; bool retry = true; + int hugetlb = 0; if (!sysctl_memory_failure_recovery) panic("Memory failure on page %lx", pfn); @@ -1739,10 +1801,9 @@ int memory_failure(unsigned long pfn, int flags) } try_again: - if (PageHuge(p)) { - res = memory_failure_hugetlb(pfn, flags); + res = try_memory_failure_hugetlb(pfn, flags, &hugetlb); + if (hugetlb) goto unlock_mutex; - } if (TestSetPageHWPoison(p)) { pr_err("Memory failure: %#lx: already hardware poisoned\n", -- cgit v1.2.3-59-g8ed1b From d173d5417fb67411e623d394aab986d847e47dad Mon Sep 17 00:00:00 2001 From: Xu Yu Date: Thu, 21 Apr 2022 16:35:37 -0700 Subject: mm/memory-failure.c: skip huge_zero_page in memory_failure() Kernel panic when injecting memory_failure for the global huge_zero_page, when CONFIG_DEBUG_VM is enabled, as follows. Injecting memory failure for pfn 0x109ff9 at process virtual address 0x20ff9000 page:00000000fb053fc3 refcount:2 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x109e00 head:00000000fb053fc3 order:9 compound_mapcount:0 compound_pincount:0 flags: 0x17fffc000010001(locked|head|node=0|zone=2|lastcpupid=0x1ffff) raw: 017fffc000010001 0000000000000000 dead000000000122 0000000000000000 raw: 0000000000000000 0000000000000000 00000002ffffffff 0000000000000000 page dumped because: VM_BUG_ON_PAGE(is_huge_zero_page(head)) ------------[ cut here ]------------ kernel BUG at mm/huge_memory.c:2499! invalid opcode: 0000 [#1] PREEMPT SMP PTI CPU: 6 PID: 553 Comm: split_bug Not tainted 5.18.0-rc1+ #11 Hardware name: Alibaba Cloud Alibaba Cloud ECS, BIOS 3288b3c 04/01/2014 RIP: 0010:split_huge_page_to_list+0x66a/0x880 Code: 84 9b fb ff ff 48 8b 7c 24 08 31 f6 e8 9f 5d 2a 00 b8 b8 02 00 00 e9 e8 fb ff ff 48 c7 c6 e8 47 3c 82 4c b RSP: 0018:ffffc90000dcbdf8 EFLAGS: 00010246 RAX: 000000000000003c RBX: 0000000000000001 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffffffff823e4c4f RDI: 00000000ffffffff RBP: ffff88843fffdb40 R08: 0000000000000000 R09: 00000000fffeffff R10: ffffc90000dcbc48 R11: ffffffff82d68448 R12: ffffea0004278000 R13: ffffffff823c6203 R14: 0000000000109ff9 R15: ffffea000427fe40 FS: 00007fc375a26740(0000) GS:ffff88842fd80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fc3757c9290 CR3: 0000000102174006 CR4: 00000000003706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: try_to_split_thp_page+0x3a/0x130 memory_failure+0x128/0x800 madvise_inject_error.cold+0x8b/0xa1 __x64_sys_madvise+0x54/0x60 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7fc3754f8bf9 Code: 01 00 48 81 c4 80 00 00 00 e9 f1 fe ff ff 0f 1f 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 8 RSP: 002b:00007ffeda93a1d8 EFLAGS: 00000217 ORIG_RAX: 000000000000001c RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007fc3754f8bf9 RDX: 0000000000000064 RSI: 0000000000003000 RDI: 0000000020ff9000 RBP: 00007ffeda93a200 R08: 0000000000000000 R09: 0000000000000000 R10: 00000000ffffffff R11: 0000000000000217 R12: 0000000000400490 R13: 00007ffeda93a2e0 R14: 0000000000000000 R15: 0000000000000000 This makes huge_zero_page bail out explicitly before split in memory_failure(), thus the panic above won't happen again. Link: https://lkml.kernel.org/r/497d3835612610e370c74e697ea3c721d1d55b9c.1649775850.git.xuyu@linux.alibaba.com Fixes: 6a46079cf57a ("HWPOISON: The high level memory error handler in the VM v7") Signed-off-by: Xu Yu Reported-by: Abaci Suggested-by: Naoya Horiguchi Acked-by: Naoya Horiguchi Reviewed-by: Miaohe Lin Cc: Anshuman Khandual Cc: Oscar Salvador Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 2020944398c9..27760c19bad7 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1860,6 +1860,19 @@ try_again: } if (PageTransHuge(hpage)) { + /* + * Bail out before SetPageHasHWPoisoned() if hpage is + * huge_zero_page, although PG_has_hwpoisoned is not + * checked in set_huge_zero_page(). + * + * TODO: Handle memory failure of huge_zero_page thoroughly. + */ + if (is_huge_zero_page(hpage)) { + action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED); + res = -EBUSY; + goto unlock_mutex; + } + /* * The flag must be set after the refcount is bumped * otherwise it may race with THP split. -- cgit v1.2.3-59-g8ed1b From 9b3016154c913b2e7ec5ae5c9a42eb9e732d86aa Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 21 Apr 2022 16:35:40 -0700 Subject: memcg: sync flush only if periodic flush is delayed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Daniel Dao has reported [1] a regression on workloads that may trigger a lot of refaults (anon and file). The underlying issue is that flushing rstat is expensive. Although rstat flush are batched with (nr_cpus * MEMCG_BATCH) stat updates, it seems like there are workloads which genuinely do stat updates larger than batch value within short amount of time. Since the rstat flush can happen in the performance critical codepaths like page faults, such workload can suffer greatly. This patch fixes this regression by making the rstat flushing conditional in the performance critical codepaths. More specifically, the kernel relies on the async periodic rstat flusher to flush the stats and only if the periodic flusher is delayed by more than twice the amount of its normal time window then the kernel allows rstat flushing from the performance critical codepaths. Now the question: what are the side-effects of this change? The worst that can happen is the refault codepath will see 4sec old lruvec stats and may cause false (or missed) activations of the refaulted page which may under-or-overestimate the workingset size. Though that is not very concerning as the kernel can already miss or do false activations. There are two more codepaths whose flushing behavior is not changed by this patch and we may need to come to them in future. One is the writeback stats used by dirty throttling and second is the deactivation heuristic in the reclaim. For now keeping an eye on them and if there is report of regression due to these codepaths, we will reevaluate then. Link: https://lore.kernel.org/all/CA+wXwBSyO87ZX5PVwdHm-=dBjZYECGmfnydUicUyrQqndgX2MQ@mail.gmail.com [1] Link: https://lkml.kernel.org/r/20220304184040.1304781-1-shakeelb@google.com Fixes: 1f828223b799 ("memcg: flush lruvec stats in the refault") Signed-off-by: Shakeel Butt Reported-by: Daniel Dao Tested-by: Ivan Babrou Cc: Michal Hocko Cc: Roman Gushchin Cc: Johannes Weiner Cc: Michal Koutný Cc: Frank Hofmann Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 5 +++++ mm/memcontrol.c | 12 +++++++++++- mm/workingset.c | 2 +- 3 files changed, 17 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index a68dce3873fc..89b14729d59f 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1012,6 +1012,7 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, } void mem_cgroup_flush_stats(void); +void mem_cgroup_flush_stats_delayed(void); void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val); @@ -1455,6 +1456,10 @@ static inline void mem_cgroup_flush_stats(void) { } +static inline void mem_cgroup_flush_stats_delayed(void) +{ +} + static inline void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 725f76723220..598fece89e2b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -587,6 +587,9 @@ static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork); static DEFINE_SPINLOCK(stats_flush_lock); static DEFINE_PER_CPU(unsigned int, stats_updates); static atomic_t stats_flush_threshold = ATOMIC_INIT(0); +static u64 flush_next_time; + +#define FLUSH_TIME (2UL*HZ) /* * Accessors to ensure that preemption is disabled on PREEMPT_RT because it can @@ -637,6 +640,7 @@ static void __mem_cgroup_flush_stats(void) if (!spin_trylock_irqsave(&stats_flush_lock, flag)) return; + flush_next_time = jiffies_64 + 2*FLUSH_TIME; cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup); atomic_set(&stats_flush_threshold, 0); spin_unlock_irqrestore(&stats_flush_lock, flag); @@ -648,10 +652,16 @@ void mem_cgroup_flush_stats(void) __mem_cgroup_flush_stats(); } +void mem_cgroup_flush_stats_delayed(void) +{ + if (time_after64(jiffies_64, flush_next_time)) + mem_cgroup_flush_stats(); +} + static void flush_memcg_stats_dwork(struct work_struct *w) { __mem_cgroup_flush_stats(); - queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ); + queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME); } /** diff --git a/mm/workingset.c b/mm/workingset.c index 8a3828acc0bf..592569a8974c 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -355,7 +355,7 @@ void workingset_refault(struct folio *folio, void *shadow) mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr); - mem_cgroup_flush_stats(); + mem_cgroup_flush_stats_delayed(); /* * Compare the distance to the existing workingset size. We * don't activate pages that couldn't stay resident even if -- cgit v1.2.3-59-g8ed1b From 0e88904cb700a9654c9f0d9ca4967e761e7c9ee8 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Thu, 21 Apr 2022 16:35:43 -0700 Subject: userfaultfd: mark uffd_wp regardless of VM_WRITE flag When a PTE is set by UFFD operations such as UFFDIO_COPY, the PTE is currently only marked as write-protected if the VMA has VM_WRITE flag set. This seems incorrect or at least would be unexpected by the users. Consider the following sequence of operations that are being performed on a certain page: mprotect(PROT_READ) UFFDIO_COPY(UFFDIO_COPY_MODE_WP) mprotect(PROT_READ|PROT_WRITE) At this point the user would expect to still get UFFD notification when the page is accessed for write, but the user would not get one, since the PTE was not marked as UFFD_WP during UFFDIO_COPY. Fix it by always marking PTEs as UFFD_WP regardless on the write-permission in the VMA flags. Link: https://lkml.kernel.org/r/20220217211602.2769-1-namit@vmware.com Fixes: 292924b26024 ("userfaultfd: wp: apply _PAGE_UFFD_WP bit") Signed-off-by: Nadav Amit Acked-by: Peter Xu Cc: Axel Rasmussen Cc: Mike Rapoport Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/userfaultfd.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 0cb8e5ef1713..e9bb6db002aa 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -72,12 +72,15 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, _dst_pte = pte_mkdirty(_dst_pte); if (page_in_cache && !vm_shared) writable = false; - if (writable) { - if (wp_copy) - _dst_pte = pte_mkuffd_wp(_dst_pte); - else - _dst_pte = pte_mkwrite(_dst_pte); - } + + /* + * Always mark a PTE as write-protected when needed, regardless of + * VM_WRITE, which the user might change. + */ + if (wp_copy) + _dst_pte = pte_mkuffd_wp(_dst_pte); + else if (writable) + _dst_pte = pte_mkwrite(_dst_pte); dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); -- cgit v1.2.3-59-g8ed1b From 5f24d5a579d1eace79d505b148808a850b417d4c Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 21 Apr 2022 16:35:46 -0700 Subject: mm, hugetlb: allow for "high" userspace addresses This is a fix for commit f6795053dac8 ("mm: mmap: Allow for "high" userspace addresses") for hugetlb. This patch adds support for "high" userspace addresses that are optionally supported on the system and have to be requested via a hint mechanism ("high" addr parameter to mmap). Architectures such as powerpc and x86 achieve this by making changes to their architectural versions of hugetlb_get_unmapped_area() function. However, arm64 uses the generic version of that function. So take into account arch_get_mmap_base() and arch_get_mmap_end() in hugetlb_get_unmapped_area(). To allow that, move those two macros out of mm/mmap.c into include/linux/sched/mm.h If these macros are not defined in architectural code then they default to (TASK_SIZE) and (base) so should not introduce any behavioural changes to architectures that do not define them. For the time being, only ARM64 is affected by this change. Catalin (ARM64) said "We should have fixed hugetlb_get_unmapped_area() as well when we added support for 52-bit VA. The reason for commit f6795053dac8 was to prevent normal mmap() from returning addresses above 48-bit by default as some user-space had hard assumptions about this. It's a slight ABI change if you do this for hugetlb_get_unmapped_area() but I doubt anyone would notice. It's more likely that the current behaviour would cause issues, so I'd rather have them consistent. Basically when arm64 gained support for 52-bit addresses we did not want user-space calling mmap() to suddenly get such high addresses, otherwise we could have inadvertently broken some programs (similar behaviour to x86 here). Hence we added commit f6795053dac8. But we missed hugetlbfs which could still get such high mmap() addresses. So in theory that's a potential regression that should have bee addressed at the same time as commit f6795053dac8 (and before arm64 enabled 52-bit addresses)" Link: https://lkml.kernel.org/r/ab847b6edb197bffdfe189e70fb4ac76bfe79e0d.1650033747.git.christophe.leroy@csgroup.eu Fixes: f6795053dac8 ("mm: mmap: Allow for "high" userspace addresses") Signed-off-by: Christophe Leroy Reviewed-by: Catalin Marinas Cc: Steve Capper Cc: Will Deacon Cc: [5.0.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 9 +++++---- include/linux/sched/mm.h | 8 ++++++++ mm/mmap.c | 8 -------- 3 files changed, 13 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 99c7477cee5c..dd3a088db11d 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -206,7 +206,7 @@ hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr, info.flags = 0; info.length = len; info.low_limit = current->mm->mmap_base; - info.high_limit = TASK_SIZE; + info.high_limit = arch_get_mmap_end(addr); info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; return vm_unmapped_area(&info); @@ -222,7 +222,7 @@ hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr, info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; info.low_limit = max(PAGE_SIZE, mmap_min_addr); - info.high_limit = current->mm->mmap_base; + info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base); info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; addr = vm_unmapped_area(&info); @@ -237,7 +237,7 @@ hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr, VM_BUG_ON(addr != -ENOMEM); info.flags = 0; info.low_limit = current->mm->mmap_base; - info.high_limit = TASK_SIZE; + info.high_limit = arch_get_mmap_end(addr); addr = vm_unmapped_area(&info); } @@ -251,6 +251,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, struct mm_struct *mm = current->mm; struct vm_area_struct *vma; struct hstate *h = hstate_file(file); + const unsigned long mmap_end = arch_get_mmap_end(addr); if (len & ~huge_page_mask(h)) return -EINVAL; @@ -266,7 +267,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, if (addr) { addr = ALIGN(addr, huge_page_size(h)); vma = find_vma(mm, addr); - if (TASK_SIZE - len >= addr && + if (mmap_end - len >= addr && (!vma || addr + len <= vm_start_gap(vma))) return addr; } diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index a80356e9dc69..1ad1f4bfa025 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -136,6 +136,14 @@ static inline void mm_update_next_owner(struct mm_struct *mm) #endif /* CONFIG_MEMCG */ #ifdef CONFIG_MMU +#ifndef arch_get_mmap_end +#define arch_get_mmap_end(addr) (TASK_SIZE) +#endif + +#ifndef arch_get_mmap_base +#define arch_get_mmap_base(addr, base) (base) +#endif + extern void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack); extern unsigned long diff --git a/mm/mmap.c b/mm/mmap.c index 3aa839f81e63..313b57d55a63 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2117,14 +2117,6 @@ unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info) return addr; } -#ifndef arch_get_mmap_end -#define arch_get_mmap_end(addr) (TASK_SIZE) -#endif - -#ifndef arch_get_mmap_base -#define arch_get_mmap_base(addr, base) (base) -#endif - /* Get an address range which is currently unmapped. * For shmat() with addr=0. * -- cgit v1.2.3-59-g8ed1b From e4a38402c36e42df28eb1a5394be87e6571fb48a Mon Sep 17 00:00:00 2001 From: Nico Pache Date: Thu, 21 Apr 2022 16:36:01 -0700 Subject: oom_kill.c: futex: delay the OOM reaper to allow time for proper futex cleanup The pthread struct is allocated on PRIVATE|ANONYMOUS memory [1] which can be targeted by the oom reaper. This mapping is used to store the futex robust list head; the kernel does not keep a copy of the robust list and instead references a userspace address to maintain the robustness during a process death. A race can occur between exit_mm and the oom reaper that allows the oom reaper to free the memory of the futex robust list before the exit path has handled the futex death: CPU1 CPU2 -------------------------------------------------------------------- page_fault do_exit "signal" wake_oom_reaper oom_reaper oom_reap_task_mm (invalidates mm) exit_mm exit_mm_release futex_exit_release futex_cleanup exit_robust_list get_user (EFAULT- can't access memory) If the get_user EFAULT's, the kernel will be unable to recover the waiters on the robust_list, leaving userspace mutexes hung indefinitely. Delay the OOM reaper, allowing more time for the exit path to perform the futex cleanup. Reproducer: https://gitlab.com/jsavitz/oom_futex_reproducer Based on a patch by Michal Hocko. Link: https://elixir.bootlin.com/glibc/glibc-2.35/source/nptl/allocatestack.c#L370 [1] Link: https://lkml.kernel.org/r/20220414144042.677008-1-npache@redhat.com Fixes: 212925802454 ("mm: oom: let oom_reap_task and exit_mmap run concurrently") Signed-off-by: Joel Savitz Signed-off-by: Nico Pache Co-developed-by: Joel Savitz Suggested-by: Thomas Gleixner Acked-by: Thomas Gleixner Acked-by: Michal Hocko Cc: Rafael Aquini Cc: Waiman Long Cc: Herton R. Krzesinski Cc: Juri Lelli Cc: Vincent Guittot Cc: Dietmar Eggemann Cc: Steven Rostedt Cc: Ben Segall Cc: Mel Gorman Cc: Daniel Bristot de Oliveira Cc: David Rientjes Cc: Andrea Arcangeli Cc: Davidlohr Bueso Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Joel Savitz Cc: Darren Hart Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 + mm/oom_kill.c | 54 ++++++++++++++++++++++++++++++++++++++------------- 2 files changed, 41 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/include/linux/sched.h b/include/linux/sched.h index d5e3c00b74e1..a8911b1f35aa 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1443,6 +1443,7 @@ struct task_struct { int pagefault_disabled; #ifdef CONFIG_MMU struct task_struct *oom_reaper_list; + struct timer_list oom_reaper_timer; #endif #ifdef CONFIG_VMAP_STACK struct vm_struct *stack_vm_area; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 7ec38194f8e1..49d7df39b02d 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -632,7 +632,7 @@ done: */ set_bit(MMF_OOM_SKIP, &mm->flags); - /* Drop a reference taken by wake_oom_reaper */ + /* Drop a reference taken by queue_oom_reaper */ put_task_struct(tsk); } @@ -644,12 +644,12 @@ static int oom_reaper(void *unused) struct task_struct *tsk = NULL; wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL); - spin_lock(&oom_reaper_lock); + spin_lock_irq(&oom_reaper_lock); if (oom_reaper_list != NULL) { tsk = oom_reaper_list; oom_reaper_list = tsk->oom_reaper_list; } - spin_unlock(&oom_reaper_lock); + spin_unlock_irq(&oom_reaper_lock); if (tsk) oom_reap_task(tsk); @@ -658,22 +658,48 @@ static int oom_reaper(void *unused) return 0; } -static void wake_oom_reaper(struct task_struct *tsk) +static void wake_oom_reaper(struct timer_list *timer) { - /* mm is already queued? */ - if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags)) - return; + struct task_struct *tsk = container_of(timer, struct task_struct, + oom_reaper_timer); + struct mm_struct *mm = tsk->signal->oom_mm; + unsigned long flags; - get_task_struct(tsk); + /* The victim managed to terminate on its own - see exit_mmap */ + if (test_bit(MMF_OOM_SKIP, &mm->flags)) { + put_task_struct(tsk); + return; + } - spin_lock(&oom_reaper_lock); + spin_lock_irqsave(&oom_reaper_lock, flags); tsk->oom_reaper_list = oom_reaper_list; oom_reaper_list = tsk; - spin_unlock(&oom_reaper_lock); + spin_unlock_irqrestore(&oom_reaper_lock, flags); trace_wake_reaper(tsk->pid); wake_up(&oom_reaper_wait); } +/* + * Give the OOM victim time to exit naturally before invoking the oom_reaping. + * The timers timeout is arbitrary... the longer it is, the longer the worst + * case scenario for the OOM can take. If it is too small, the oom_reaper can + * get in the way and release resources needed by the process exit path. + * e.g. The futex robust list can sit in Anon|Private memory that gets reaped + * before the exit path is able to wake the futex waiters. + */ +#define OOM_REAPER_DELAY (2*HZ) +static void queue_oom_reaper(struct task_struct *tsk) +{ + /* mm is already queued? */ + if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags)) + return; + + get_task_struct(tsk); + timer_setup(&tsk->oom_reaper_timer, wake_oom_reaper, 0); + tsk->oom_reaper_timer.expires = jiffies + OOM_REAPER_DELAY; + add_timer(&tsk->oom_reaper_timer); +} + static int __init oom_init(void) { oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper"); @@ -681,7 +707,7 @@ static int __init oom_init(void) } subsys_initcall(oom_init) #else -static inline void wake_oom_reaper(struct task_struct *tsk) +static inline void queue_oom_reaper(struct task_struct *tsk) { } #endif /* CONFIG_MMU */ @@ -932,7 +958,7 @@ static void __oom_kill_process(struct task_struct *victim, const char *message) rcu_read_unlock(); if (can_oom_reap) - wake_oom_reaper(victim); + queue_oom_reaper(victim); mmdrop(mm); put_task_struct(victim); @@ -968,7 +994,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message) task_lock(victim); if (task_will_free_mem(victim)) { mark_oom_victim(victim); - wake_oom_reaper(victim); + queue_oom_reaper(victim); task_unlock(victim); put_task_struct(victim); return; @@ -1067,7 +1093,7 @@ bool out_of_memory(struct oom_control *oc) */ if (task_will_free_mem(current)) { mark_oom_victim(current); - wake_oom_reaper(current); + queue_oom_reaper(current); return true; } -- cgit v1.2.3-59-g8ed1b From 319561669a59d8e9206ab311ae5433ef92fd79d1 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Thu, 21 Apr 2022 16:36:10 -0700 Subject: mm/mmu_notifier.c: fix race in mmu_interval_notifier_remove() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In some cases it is possible for mmu_interval_notifier_remove() to race with mn_tree_inv_end() allowing it to return while the notifier data structure is still in use. Consider the following sequence: CPU0 - mn_tree_inv_end() CPU1 - mmu_interval_notifier_remove() ----------------------------------- ------------------------------------ spin_lock(subscriptions->lock); seq = subscriptions->invalidate_seq; spin_lock(subscriptions->lock); spin_unlock(subscriptions->lock); subscriptions->invalidate_seq++; wait_event(invalidate_seq != seq); return; interval_tree_remove(interval_sub); kfree(interval_sub); spin_unlock(subscriptions->lock); wake_up_all(); As the wait_event() condition is true it will return immediately. This can lead to use-after-free type errors if the caller frees the data structure containing the interval notifier subscription while it is still on a deferred list. Fix this by taking the appropriate lock when reading invalidate_seq to ensure proper synchronisation. I observed this whilst running stress testing during some development. You do have to be pretty unlucky, but it leads to the usual problems of use-after-free (memory corruption, kernel crash, difficult to diagnose WARN_ON, etc). Link: https://lkml.kernel.org/r/20220420043734.476348-1-apopple@nvidia.com Fixes: 99cb252f5e68 ("mm/mmu_notifier: add an interval tree notifier") Signed-off-by: Alistair Popple Signed-off-by: Jason Gunthorpe Cc: Christian König Cc: John Hubbard Cc: Ralph Campbell Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmu_notifier.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 459d195d2ff6..f45ff1b7626a 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -1036,6 +1036,18 @@ int mmu_interval_notifier_insert_locked( } EXPORT_SYMBOL_GPL(mmu_interval_notifier_insert_locked); +static bool +mmu_interval_seq_released(struct mmu_notifier_subscriptions *subscriptions, + unsigned long seq) +{ + bool ret; + + spin_lock(&subscriptions->lock); + ret = subscriptions->invalidate_seq != seq; + spin_unlock(&subscriptions->lock); + return ret; +} + /** * mmu_interval_notifier_remove - Remove a interval notifier * @interval_sub: Interval subscription to unregister @@ -1083,7 +1095,7 @@ void mmu_interval_notifier_remove(struct mmu_interval_notifier *interval_sub) lock_map_release(&__mmu_notifier_invalidate_range_start_map); if (seq) wait_event(subscriptions->wq, - READ_ONCE(subscriptions->invalidate_seq) != seq); + mmu_interval_seq_released(subscriptions, seq)); /* pairs with mmgrab in mmu_interval_notifier_insert() */ mmdrop(mm); -- cgit v1.2.3-59-g8ed1b