From 0129ab1f268b6cf88825eae819b9b84aa0a85634 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Fri, 24 Dec 2021 21:12:32 -0800 Subject: kfence: fix memory leak when cat kfence objects Hulk robot reported a kmemleak problem: unreferenced object 0xffff93d1d8cc02e8 (size 248): comm "cat", pid 23327, jiffies 4624670141 (age 495992.217s) hex dump (first 32 bytes): 00 40 85 19 d4 93 ff ff 00 10 00 00 00 00 00 00 .@.............. 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: seq_open+0x2a/0x80 full_proxy_open+0x167/0x1e0 do_dentry_open+0x1e1/0x3a0 path_openat+0x961/0xa20 do_filp_open+0xae/0x120 do_sys_openat2+0x216/0x2f0 do_sys_open+0x57/0x80 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 unreferenced object 0xffff93d419854000 (size 4096): comm "cat", pid 23327, jiffies 4624670141 (age 495992.217s) hex dump (first 32 bytes): 6b 66 65 6e 63 65 2d 23 32 35 30 3a 20 30 78 30 kfence-#250: 0x0 30 30 30 30 30 30 30 37 35 34 62 64 61 31 32 2d 0000000754bda12- backtrace: seq_read_iter+0x313/0x440 seq_read+0x14b/0x1a0 full_proxy_read+0x56/0x80 vfs_read+0xa5/0x1b0 ksys_read+0xa0/0xf0 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 I find that we can easily reproduce this problem with the following commands: cat /sys/kernel/debug/kfence/objects echo scan > /sys/kernel/debug/kmemleak cat /sys/kernel/debug/kmemleak The leaked memory is allocated in the stack below: do_syscall_64 do_sys_open do_dentry_open full_proxy_open seq_open ---> alloc seq_file vfs_read full_proxy_read seq_read seq_read_iter traverse ---> alloc seq_buf And it should have been released in the following process: do_syscall_64 syscall_exit_to_user_mode exit_to_user_mode_prepare task_work_run ____fput __fput full_proxy_release ---> free here However, the release function corresponding to file_operations is not implemented in kfence. As a result, a memory leak occurs. Therefore, the solution to this problem is to implement the corresponding release function. Link: https://lkml.kernel.org/r/20211206133628.2822545-1-libaokun1@huawei.com Fixes: 0ce20dd84089 ("mm: add Kernel Electric-Fence infrastructure") Signed-off-by: Baokun Li Reported-by: Hulk Robot Acked-by: Marco Elver Reviewed-by: Kefeng Wang Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Yu Kuai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kfence/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 09945784df9e..a19154a8d196 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -683,6 +683,7 @@ static const struct file_operations objects_fops = { .open = open_objects, .read = seq_read, .llseek = seq_lseek, + .release = seq_release, }; static int __init kfence_debugfs_init(void) -- cgit v1.2.3-59-g8ed1b From 338635340669d5b317c7e8dcf4fff4a0f3651d87 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 24 Dec 2021 21:12:35 -0800 Subject: mm: mempolicy: fix THP allocations escaping mempolicy restrictions alloc_pages_vma() may try to allocate THP page on the local NUMA node first: page = __alloc_pages_node(hpage_node, gfp | __GFP_THISNODE | __GFP_NORETRY, order); And if the allocation fails it retries allowing remote memory: if (!page && (gfp & __GFP_DIRECT_RECLAIM)) page = __alloc_pages_node(hpage_node, gfp, order); However, this retry allocation completely ignores memory policy nodemask allowing allocation to escape restrictions. The first appearance of this bug seems to be the commit ac5b2c18911f ("mm: thp: relax __GFP_THISNODE for MADV_HUGEPAGE mappings"). The bug disappeared later in the commit 89c83fb539f9 ("mm, thp: consolidate THP gfp handling into alloc_hugepage_direct_gfpmask") and reappeared again in slightly different form in the commit 76e654cc91bb ("mm, page_alloc: allow hugepage fallback to remote nodes when madvised") Fix this by passing correct nodemask to the __alloc_pages() call. The demonstration/reproducer of the problem: $ mount -oremount,size=4G,huge=always /dev/shm/ $ echo always > /sys/kernel/mm/transparent_hugepage/defrag $ cat mbind_thp.c #include #include #include #include #include #include #include #include #define SIZE 2ULL << 30 int main(int argc, char **argv) { int fd; unsigned long long i; char *addr; pid_t pid; char buf[100]; unsigned long nodemask = 1; fd = open("/dev/shm/test", O_RDWR|O_CREAT); assert(fd > 0); assert(ftruncate(fd, SIZE) == 0); addr = mmap(NULL, SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); assert(mbind(addr, SIZE, MPOL_BIND, &nodemask, 2, MPOL_MF_STRICT|MPOL_MF_MOVE)==0); for (i = 0; i < SIZE; i+=4096) { addr[i] = 1; } pid = getpid(); snprintf(buf, sizeof(buf), "grep shm /proc/%d/numa_maps", pid); system(buf); sleep(10000); return 0; } $ gcc mbind_thp.c -o mbind_thp -lnuma $ numactl -H available: 2 nodes (0-1) node 0 cpus: 0 2 node 0 size: 1918 MB node 0 free: 1595 MB node 1 cpus: 1 3 node 1 size: 2014 MB node 1 free: 1731 MB node distances: node 0 1 0: 10 20 1: 20 10 $ rm -f /dev/shm/test; taskset -c 0 ./mbind_thp 7fd970a00000 bind:0 file=/dev/shm/test dirty=524288 active=0 N0=396800 N1=127488 kernelpagesize_kB=4 Link: https://lkml.kernel.org/r/20211208165343.22349-1-arbn@yandex-team.com Fixes: ac5b2c18911f ("mm: thp: relax __GFP_THISNODE for MADV_HUGEPAGE mappings") Signed-off-by: Andrey Ryabinin Acked-by: Michal Hocko Acked-by: Mel Gorman Acked-by: David Rientjes Cc: Andrea Arcangeli Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 10e9c87260ed..f6248affaf38 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2140,8 +2140,7 @@ struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, * memory with both reclaim and compact as well. */ if (!page && (gfp & __GFP_DIRECT_RECLAIM)) - page = __alloc_pages_node(hpage_node, - gfp, order); + page = __alloc_pages(gfp, order, hpage_node, nmask); goto out; } -- cgit v1.2.3-59-g8ed1b From 71d2bcec2d4d69ff109c497e6611d6c53c8926d4 Mon Sep 17 00:00:00 2001 From: Philipp Rudo Date: Fri, 24 Dec 2021 21:12:39 -0800 Subject: kernel/crash_core: suppress unknown crashkernel parameter warning When booting with crashkernel= on the kernel command line a warning similar to Kernel command line: ro console=ttyS0 crashkernel=256M Unknown kernel command line parameters "crashkernel=256M", will be passed to user space. is printed. This comes from crashkernel= being parsed independent from the kernel parameter handling mechanism. So the code in init/main.c doesn't know that crashkernel= is a valid kernel parameter and prints this incorrect warning. Suppress the warning by adding a dummy early_param handler for crashkernel=. Link: https://lkml.kernel.org/r/20211208133443.6867-1-prudo@redhat.com Fixes: 86d1919a4fb0 ("init: print out unknown kernel parameters") Signed-off-by: Philipp Rudo Acked-by: Baoquan He Cc: Andrew Halaney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/crash_core.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/kernel/crash_core.c b/kernel/crash_core.c index eb53f5ec62c9..256cf6db573c 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -6,6 +6,7 @@ #include #include +#include #include #include @@ -295,6 +296,16 @@ int __init parse_crashkernel_low(char *cmdline, "crashkernel=", suffix_tbl[SUFFIX_LOW]); } +/* + * Add a dummy early_param handler to mark crashkernel= as a known command line + * parameter and suppress incorrect warnings in init/main.c. + */ +static int __init parse_crashkernel_dummy(char *arg) +{ + return 0; +} +early_param("crashkernel", parse_crashkernel_dummy); + Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, void *data, size_t data_len) { -- cgit v1.2.3-59-g8ed1b From 7e5b901e4609441fc6bb94701c4743b39b6c277e Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 24 Dec 2021 21:12:42 -0800 Subject: MAINTAINERS: mark more list instances as moderated Some lists that are moderated are not marked as moderated consistently, so mark them all as moderated. Link: https://lkml.kernel.org/r/20211209001330.18558-1-rdunlap@infradead.org Signed-off-by: Randy Dunlap Cc: Miquel Raynal Cc: Conor Culhane Cc: Ryder Lee Cc: Jianjun Wang Cc: Alexandre Belloni Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 8912b2c1260c..fb18ce7168aa 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14845,7 +14845,7 @@ PCIE DRIVER FOR MEDIATEK M: Ryder Lee M: Jianjun Wang L: linux-pci@vger.kernel.org -L: linux-mediatek@lists.infradead.org +L: linux-mediatek@lists.infradead.org (moderated for non-subscribers) S: Supported F: Documentation/devicetree/bindings/pci/mediatek* F: drivers/pci/controller/*mediatek* @@ -17423,7 +17423,7 @@ F: drivers/video/fbdev/sm712* SILVACO I3C DUAL-ROLE MASTER M: Miquel Raynal M: Conor Culhane -L: linux-i3c@lists.infradead.org +L: linux-i3c@lists.infradead.org (moderated for non-subscribers) S: Maintained F: Documentation/devicetree/bindings/i3c/silvaco,i3c-master.yaml F: drivers/i3c/master/svc-i3c-master.c -- cgit v1.2.3-59-g8ed1b From e37e7b0b3bd52ec4f8ab71b027bcec08f57f1b3b Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Fri, 24 Dec 2021 21:12:45 -0800 Subject: mm, hwpoison: fix condition in free hugetlb page path When a memory error hits a tail page of a free hugepage, __page_handle_poison() is expected to be called to isolate the error in 4kB unit, but it's not called due to the outdated if-condition in memory_failure_hugetlb(). This loses the chance to isolate the error in the finer unit, so it's not optimal. Drop the condition. This "(p != head && TestSetPageHWPoison(head)" condition is based on the old semantics of PageHWPoison on hugepage (where PG_hwpoison flag was set on the subpage), so it's not necessray any more. By getting to set PG_hwpoison on head page for hugepages, concurrent error events on different subpages in a single hugepage can be prevented by TestSetPageHWPoison(head) at the beginning of memory_failure_hugetlb(). So dropping the condition should not reopen the race window originally mentioned in commit b985194c8c0a ("hwpoison, hugetlb: lock_page/unlock_page does not match for handling a free hugepage") [naoya.horiguchi@linux.dev: fix "HardwareCorrupted" counter] Link: https://lkml.kernel.org/r/20211220084851.GA1460264@u2004 Link: https://lkml.kernel.org/r/20211210110208.879740-1-naoya.horiguchi@linux.dev Signed-off-by: Naoya Horiguchi Reported-by: Fei Luo Reviewed-by: Mike Kravetz Cc: [5.14+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 07c875fdeaf0..682828b94ab6 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1470,17 +1470,12 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags) if (!(flags & MF_COUNT_INCREASED)) { res = get_hwpoison_page(p, flags); if (!res) { - /* - * Check "filter hit" and "race with other subpage." - */ lock_page(head); - if (PageHWPoison(head)) { - if ((hwpoison_filter(p) && TestClearPageHWPoison(p)) - || (p != head && TestSetPageHWPoison(head))) { + if (hwpoison_filter(p)) { + if (TestClearPageHWPoison(head)) num_poisoned_pages_dec(); - unlock_page(head); - return 0; - } + unlock_page(head); + return 0; } unlock_page(head); res = MF_FAILED; -- cgit v1.2.3-59-g8ed1b From 94ab10dd42a70acc5208a41325617e3d9cf81a70 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 24 Dec 2021 21:12:48 -0800 Subject: mm: delete unsafe BUG from page_cache_add_speculative() It is not easily reproducible, but on 5.16-rc I have several times hit the VM_BUG_ON_PAGE(PageTail(page), page) in page_cache_add_speculative(): usually from filemap_get_read_batch() for an ext4 read, yesterday from next_uptodate_page() from filemap_map_pages() for a shmem fault. That BUG used to be placed where page_ref_add_unless() had succeeded, but now it is placed before folio_ref_add_unless() is attempted: that is not safe, since it is only the acquired reference which makes the page safe from racing THP collapse or split. We could keep the BUG, checking PageTail only when folio_ref_try_add_rcu() has succeeded; but I don't think it adds much value - just delete it. Link: https://lkml.kernel.org/r/8b98fc6f-3439-8614-c3f3-945c659a1aba@google.com Fixes: 020853b6f5ea ("mm: Add folio_try_get_rcu()") Signed-off-by: Hugh Dickins Acked-by: Kirill A. Shutemov Reviewed-by: Matthew Wilcox (Oracle) Cc: Vlastimil Babka Cc: William Kucharski Cc: Christoph Hellwig Cc: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 605246452305..d150a9082b31 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -285,7 +285,6 @@ static inline struct inode *folio_inode(struct folio *folio) static inline bool page_cache_add_speculative(struct page *page, int count) { - VM_BUG_ON_PAGE(PageTail(page), page); return folio_ref_try_add_rcu((struct folio *)page, count); } -- cgit v1.2.3-59-g8ed1b From 595ec1973c276f6c0c1de8aca5eef8dfd81f9b49 Mon Sep 17 00:00:00 2001 From: Thibaut Sautereau Date: Fri, 24 Dec 2021 21:12:51 -0800 Subject: mm/page_alloc: fix __alloc_size attribute for alloc_pages_exact_nid The second parameter of alloc_pages_exact_nid is the one indicating the size of memory pointed by the returned pointer. Link: https://lkml.kernel.org/r/YbjEgwhn4bGblp//@coeus Fixes: abd58f38dfb4 ("mm/page_alloc: add __alloc_size attributes for better bounds checking") Signed-off-by: Thibaut Sautereau Acked-by: Kees Cook Cc: Daniel Micay Cc: Levente Polyak Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index b976c4177299..8fcc38467af6 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -624,7 +624,7 @@ extern unsigned long get_zeroed_page(gfp_t gfp_mask); void *alloc_pages_exact(size_t size, gfp_t gfp_mask) __alloc_size(1); void free_pages_exact(void *virt, size_t size); -__meminit void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) __alloc_size(1); +__meminit void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) __alloc_size(2); #define __get_free_page(gfp_mask) \ __get_free_pages((gfp_mask), 0) -- cgit v1.2.3-59-g8ed1b From 34796417964b8d0aef45a99cf6c2d20cebe33733 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 24 Dec 2021 21:12:54 -0800 Subject: mm/damon/dbgfs: protect targets destructions with kdamond_lock DAMON debugfs interface iterates current monitoring targets in 'dbgfs_target_ids_read()' while holding the corresponding 'kdamond_lock'. However, it also destructs the monitoring targets in 'dbgfs_before_terminate()' without holding the lock. This can result in a use_after_free bug. This commit avoids the race by protecting the destruction with the corresponding 'kdamond_lock'. Link: https://lkml.kernel.org/r/20211221094447.2241-1-sj@kernel.org Reported-by: Sangwoo Bae Fixes: 4bc05954d007 ("mm/damon: implement a debugfs-based user space interface") Signed-off-by: SeongJae Park Cc: [5.15.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/dbgfs.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 1efac0022e9a..4fbd729edc9e 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -650,10 +650,12 @@ static void dbgfs_before_terminate(struct damon_ctx *ctx) if (!targetid_is_pid(ctx)) return; + mutex_lock(&ctx->kdamond_lock); damon_for_each_target_safe(t, next, ctx) { put_pid((struct pid *)t->id); damon_destroy_target(t); } + mutex_unlock(&ctx->kdamond_lock); } static struct damon_ctx *dbgfs_new_ctx(void) -- cgit v1.2.3-59-g8ed1b From 2a57d83c78f889bf3f54eede908d0643c40d5418 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Fri, 24 Dec 2021 21:12:58 -0800 Subject: mm/hwpoison: clear MF_COUNT_INCREASED before retrying get_any_page() Hulk Robot reported a panic in put_page_testzero() when testing madvise() with MADV_SOFT_OFFLINE. The BUG() is triggered when retrying get_any_page(). This is because we keep MF_COUNT_INCREASED flag in second try but the refcnt is not increased. page dumped because: VM_BUG_ON_PAGE(page_ref_count(page) == 0) ------------[ cut here ]------------ kernel BUG at include/linux/mm.h:737! invalid opcode: 0000 [#1] PREEMPT SMP CPU: 5 PID: 2135 Comm: sshd Tainted: G B 5.16.0-rc6-dirty #373 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014 RIP: release_pages+0x53f/0x840 Call Trace: free_pages_and_swap_cache+0x64/0x80 tlb_flush_mmu+0x6f/0x220 unmap_page_range+0xe6c/0x12c0 unmap_single_vma+0x90/0x170 unmap_vmas+0xc4/0x180 exit_mmap+0xde/0x3a0 mmput+0xa3/0x250 do_exit+0x564/0x1470 do_group_exit+0x3b/0x100 __do_sys_exit_group+0x13/0x20 __x64_sys_exit_group+0x16/0x20 do_syscall_64+0x34/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xae Modules linked in: ---[ end trace e99579b570fe0649 ]--- RIP: 0010:release_pages+0x53f/0x840 Link: https://lkml.kernel.org/r/20211221074908.3910286-1-liushixin2@huawei.com Fixes: b94e02822deb ("mm,hwpoison: try to narrow window race for free pages") Signed-off-by: Liu Shixin Reported-by: Hulk Robot Reviewed-by: Oscar Salvador Acked-by: Naoya Horiguchi Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 682828b94ab6..3a274468f193 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2234,6 +2234,7 @@ retry: } else if (ret == 0) { if (soft_offline_free_page(page) && try_again) { try_again = false; + flags &= ~MF_COUNT_INCREASED; goto retry; } } -- cgit v1.2.3-59-g8ed1b