From 73f37dbcfe1763ee2294c7717a1f571e27d17fd8 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Fri, 10 Jun 2022 10:38:12 -0700 Subject: mm: userfaultfd: fix UFFDIO_CONTINUE on fallocated shmem pages When fallocate() is used on a shmem file, the pages we allocate can end up with !PageUptodate. Since UFFDIO_CONTINUE tries to find the existing page the user wants to map with SGP_READ, we would fail to find such a page, since shmem_getpage_gfp returns with a "NULL" pagep for SGP_READ if it discovers !PageUptodate. As a result, UFFDIO_CONTINUE returns -EFAULT, as it would do if the page wasn't found in the page cache at all. This isn't the intended behavior. UFFDIO_CONTINUE is just trying to find if a page exists, and doesn't care whether it still needs to be cleared or not. So, instead of SGP_READ, pass in SGP_NOALLOC. This is the same, except for one critical difference: in the !PageUptodate case, SGP_NOALLOC will clear the page and then return it. With this change, UFFDIO_CONTINUE works properly (succeeds) on a shmem file which has been fallocated, but otherwise not modified. Link: https://lkml.kernel.org/r/20220610173812.1768919-1-axelrasmussen@google.com Fixes: 153132571f02 ("userfaultfd/shmem: support UFFDIO_CONTINUE for shmem") Signed-off-by: Axel Rasmussen Acked-by: Peter Xu Cc: Hugh Dickins Cc: Signed-off-by: Andrew Morton --- mm/userfaultfd.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 4f4892a5f767..07d3befc80e4 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -246,7 +246,10 @@ static int mcontinue_atomic_pte(struct mm_struct *dst_mm, struct page *page; int ret; - ret = shmem_getpage(inode, pgoff, &page, SGP_READ); + ret = shmem_getpage(inode, pgoff, &page, SGP_NOALLOC); + /* Our caller expects us to return -EFAULT if we failed to find page. */ + if (ret == -ENOENT) + ret = -EFAULT; if (ret) goto out; if (!page) { -- cgit v1.2.3-59-g8ed1b From ed1523a895ffdabcab6e067af18685ed00f5ce15 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Mon, 20 Jun 2022 10:34:42 +0800 Subject: mm/damon: use set_huge_pte_at() to make huge pte old The huge_ptep_set_access_flags() can not make the huge pte old according to the discussion [1], that means we will always mornitor the young state of the hugetlb though we stopped accessing the hugetlb, as a result DAMON will get inaccurate accessing statistics. So changing to use set_huge_pte_at() to make the huge pte old to fix this issue. [1] https://lore.kernel.org/all/Yqy97gXI4Nqb7dYo@arm.com/ Link: https://lkml.kernel.org/r/1655692482-28797-1-git-send-email-baolin.wang@linux.alibaba.com Fixes: 49f4203aae06 ("mm/damon: add access checking for hugetlb pages") Signed-off-by: Baolin Wang Reviewed-by: SeongJae Park Acked-by: Mike Kravetz Reviewed-by: Muchun Song Cc: Signed-off-by: Andrew Morton --- mm/damon/vaddr.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 59e1653799f8..3c7b9d6dca95 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -336,8 +336,7 @@ static void damon_hugetlb_mkold(pte_t *pte, struct mm_struct *mm, if (pte_young(entry)) { referenced = true; entry = pte_mkold(entry); - huge_ptep_set_access_flags(vma, addr, pte, entry, - vma->vm_flags & VM_WRITE); + set_huge_pte_at(mm, addr, pte, entry); } #ifdef CONFIG_MMU_NOTIFIER -- cgit v1.2.3-59-g8ed1b From 39d35edee4537487e5178f258e23518272a66413 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Mon, 20 Jun 2022 10:30:19 +0800 Subject: mm: sparsemem: fix missing higher order allocation splitting Higher order allocations for vmemmap pages from buddy allocator must be able to be treated as indepdenent small pages as they can be freed individually by the caller. There is no problem for higher order vmemmap pages allocated at boot time since each individual small page will be initialized at boot time. However, it will be an issue for memory hotplug case since those higher order vmemmap pages are allocated from buddy allocator without initializing each individual small page's refcount. The system will panic in put_page_testzero() when CONFIG_DEBUG_VM is enabled if the vmemmap page is freed. Link: https://lkml.kernel.org/r/20220620023019.94257-1-songmuchun@bytedance.com Fixes: d8d55f5616cf ("mm: sparsemem: use page table lock to protect kernel pmd operations") Signed-off-by: Muchun Song Reviewed-by: Mike Kravetz Cc: Xiongchun Duan Cc: Signed-off-by: Andrew Morton --- mm/sparse-vmemmap.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'mm') diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index f4fa61dbbee3..dbbd1a7e65f3 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -78,6 +78,14 @@ static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start) spin_lock(&init_mm.page_table_lock); if (likely(pmd_leaf(*pmd))) { + /* + * Higher order allocations from buddy allocator must be able to + * be treated as indepdenent small pages (as they can be freed + * individually). + */ + if (!PageReserved(page)) + split_page(page, get_order(PMD_SIZE)); + /* Make pte visible before pmd. See comment in pmd_install(). */ smp_wmb(); pmd_populate_kernel(&init_mm, pmd, pgtable); -- cgit v1.2.3-59-g8ed1b From 1118234e4bc22ff50e9eae40ad95b17a6b12cefa Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 23 Jun 2022 22:53:32 +0200 Subject: mm/rmap: fix dereferencing invalid subpage pointer in try_to_migrate_one() The subpage we calculate is an invalid pointer for device private pages, because device private pages are mapped via non-present device private entries, not ordinary present PTEs. Let's just not compute broken pointers and fixup later. Move the proper assignment of the correct subpage to the beginning of the function and assert that we really only have a single page in our folio. This currently results in a BUG when tying to compute anon_exclusive, because: [ 528.727237] BUG: unable to handle page fault for address: ffffea1fffffffc0 [ 528.739585] #PF: supervisor read access in kernel mode [ 528.745324] #PF: error_code(0x0000) - not-present page [ 528.751062] PGD 44eaf2067 P4D 44eaf2067 PUD 0 [ 528.756026] Oops: 0000 [#1] PREEMPT SMP NOPTI [ 528.760890] CPU: 120 PID: 18275 Comm: hmm-tests Not tainted 5.19.0-rc3-kfd-alex #257 [ 528.769542] Hardware name: AMD Corporation BardPeak/BardPeak, BIOS RTY1002BDS 09/17/2021 [ 528.778579] RIP: 0010:try_to_migrate_one+0x21a/0x1000 [ 528.784225] Code: f6 48 89 c8 48 2b 05 45 d1 6a 01 48 c1 f8 06 48 29 c3 48 8b 45 a8 48 c1 e3 06 48 01 cb f6 41 18 01 48 89 85 50 ff ff ff 74 0b <4c> 8b 33 49 c1 ee 11 41 83 e6 01 48 8b bd 48 ff ff ff e8 3f 99 02 [ 528.805194] RSP: 0000:ffffc90003cdfaa0 EFLAGS: 00010202 [ 528.811027] RAX: 00007ffff7ff4000 RBX: ffffea1fffffffc0 RCX: ffffeaffffffffc0 [ 528.818995] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffc90003cdfaf8 [ 528.826962] RBP: ffffc90003cdfb70 R08: 0000000000000000 R09: 0000000000000000 [ 528.834930] R10: ffffc90003cdf910 R11: 0000000000000002 R12: ffff888194450540 [ 528.842899] R13: ffff888160d057c0 R14: 0000000000000000 R15: 03ffffffffffffff [ 528.850865] FS: 00007ffff7fdb740(0000) GS:ffff8883b0600000(0000) knlGS:0000000000000000 [ 528.859891] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 528.866308] CR2: ffffea1fffffffc0 CR3: 00000001562b4003 CR4: 0000000000770ee0 [ 528.874275] PKRU: 55555554 [ 528.877286] Call Trace: [ 528.880016] [ 528.882356] ? lock_is_held_type+0xdf/0x130 [ 528.887033] rmap_walk_anon+0x167/0x410 [ 528.891316] try_to_migrate+0x90/0xd0 [ 528.895405] ? try_to_unmap_one+0xe10/0xe10 [ 528.900074] ? anon_vma_ctor+0x50/0x50 [ 528.904260] ? put_anon_vma+0x10/0x10 [ 528.908347] ? invalid_mkclean_vma+0x20/0x20 [ 528.913114] migrate_vma_setup+0x5f4/0x750 [ 528.917691] dmirror_devmem_fault+0x8c/0x250 [test_hmm] [ 528.923532] do_swap_page+0xac0/0xe50 [ 528.927623] ? __lock_acquire+0x4b2/0x1ac0 [ 528.932199] __handle_mm_fault+0x949/0x1440 [ 528.936876] handle_mm_fault+0x13f/0x3e0 [ 528.941256] do_user_addr_fault+0x215/0x740 [ 528.945928] exc_page_fault+0x75/0x280 [ 528.950115] asm_exc_page_fault+0x27/0x30 [ 528.954593] RIP: 0033:0x40366b ... Link: https://lkml.kernel.org/r/20220623205332.319257-1-david@redhat.com Fixes: 6c287605fd56 ("mm: remember exclusively mapped anonymous pages with PG_anon_exclusive") Signed-off-by: David Hildenbrand Reported-by: "Sierra Guiza, Alejandro (Alex)" Reviewed-by: Alistair Popple Tested-by: Alistair Popple Cc: Vlastimil Babka Cc: Christoph Hellwig Cc: "Matthew Wilcox (Oracle)" Signed-off-by: Andrew Morton --- mm/rmap.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index 5bcb334cd6f2..746c05acad27 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1899,8 +1899,23 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, /* Unexpected PMD-mapped THP? */ VM_BUG_ON_FOLIO(!pvmw.pte, folio); - subpage = folio_page(folio, - pte_pfn(*pvmw.pte) - folio_pfn(folio)); + if (folio_is_zone_device(folio)) { + /* + * Our PTE is a non-present device exclusive entry and + * calculating the subpage as for the common case would + * result in an invalid pointer. + * + * Since only PAGE_SIZE pages can currently be + * migrated, just set it to page. This will need to be + * changed when hugepage migrations to device private + * memory are supported. + */ + VM_BUG_ON_FOLIO(folio_nr_pages(folio) > 1, folio); + subpage = &folio->page; + } else { + subpage = folio_page(folio, + pte_pfn(*pvmw.pte) - folio_pfn(folio)); + } address = pvmw.address; anon_exclusive = folio_test_anon(folio) && PageAnonExclusive(subpage); @@ -1993,15 +2008,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, /* * No need to invalidate here it will synchronize on * against the special swap migration pte. - * - * The assignment to subpage above was computed from a - * swap PTE which results in an invalid pointer. - * Since only PAGE_SIZE pages can currently be - * migrated, just set it to page. This will need to be - * changed when hugepage migrations to device private - * memory are supported. */ - subpage = &folio->page; } else if (PageHWPoison(subpage)) { pteval = swp_entry_to_pte(make_hwpoison_entry(subpage)); if (folio_test_hugetlb(folio)) { -- cgit v1.2.3-59-g8ed1b From 14c99d65941538aa33edd8dc7b1bbbb593c324a2 Mon Sep 17 00:00:00 2001 From: "Gowans, James" Date: Thu, 23 Jun 2022 05:24:03 +0000 Subject: mm: split huge PUD on wp_huge_pud fallback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently the implementation will split the PUD when a fallback is taken inside the create_huge_pud function. This isn't where it should be done: the splitting should be done in wp_huge_pud, just like it's done for PMDs. Reason being that if a callback is taken during create, there is no PUD yet so nothing to split, whereas if a fallback is taken when encountering a write protection fault there is something to split. It looks like this was the original intention with the commit where the splitting was introduced, but somehow it got moved to the wrong place between v1 and v2 of the patch series. Rebase mistake perhaps. Link: https://lkml.kernel.org/r/6f48d622eb8bce1ae5dd75327b0b73894a2ec407.camel@amazon.com Fixes: 327e9fd48972 ("mm: Split huge pages on write-notify or COW") Signed-off-by: James Gowans Reviewed-by: Thomas Hellström Cc: Christian König Cc: Jan H. Schönherr Signed-off-by: Andrew Morton --- mm/memory.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 7a089145cad4..4cf7d4b6c950 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4798,6 +4798,19 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf) static vm_fault_t create_huge_pud(struct vm_fault *vmf) { +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ + defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) + /* No support for anonymous transparent PUD pages yet */ + if (vma_is_anonymous(vmf->vma)) + return VM_FAULT_FALLBACK; + if (vmf->vma->vm_ops->huge_fault) + return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + return VM_FAULT_FALLBACK; +} + +static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud) +{ #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) /* No support for anonymous transparent PUD pages yet */ @@ -4812,19 +4825,7 @@ static vm_fault_t create_huge_pud(struct vm_fault *vmf) split: /* COW or write-notify not handled on PUD level: split pud.*/ __split_huge_pud(vmf->vma, vmf->pud, vmf->address); -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ - return VM_FAULT_FALLBACK; -} - -static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud) -{ -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - /* No support for anonymous transparent PUD pages yet */ - if (vma_is_anonymous(vmf->vma)) - return VM_FAULT_FALLBACK; - if (vmf->vma->vm_ops->huge_fault) - return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD); -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ return VM_FAULT_FALLBACK; } -- cgit v1.2.3-59-g8ed1b