From 30af24facf0aed12dec23bdf6eac6a907f88306a Mon Sep 17 00:00:00 2001
From: Lokesh Gidra
Date: Thu, 21 Mar 2024 16:58:18 -0700
Subject: userfaultfd: fix deadlock warning when locking src and dst VMAs

Use down_read_nested() to avoid the warning.

Link: https://lkml.kernel.org/r/20240321235818.125118-1-lokeshgidra@google.com
Fixes: 867a43a34ff8 ("userfaultfd: use per-vma locks in userfaultfd operations")
Reported-by: syzbot+49056626fe41e01f2ba7@syzkaller.appspotmail.com
Signed-off-by: Lokesh Gidra
Cc: Andrea Arcangeli
Cc: Axel Rasmussen
Cc: Brian Geffon
Cc: David Hildenbrand
Cc: Hillf Danton
Cc: Jann Horn [Bug #2]
Cc: Kalesh Singh
Cc: Lokesh Gidra
Cc: Mike Rapoport (IBM)
Cc: Nicolas Geoffray
Cc: Peter Xu
Cc: Suren Baghdasaryan
Signed-off-by: Andrew Morton
---
 mm/userfaultfd.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'mm/userfaultfd.c')

diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 712160cd41ec..3c3539c573e7 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -1444,7 +1444,8 @@ static int uffd_move_lock(struct mm_struct *mm,
 		 */
 		down_read(&(*dst_vmap)->vm_lock->lock);
 		if (*dst_vmap != *src_vmap)
-			down_read(&(*src_vmap)->vm_lock->lock);
+			down_read_nested(&(*src_vmap)->vm_lock->lock,
+					 SINGLE_DEPTH_NESTING);
 	}
 	mmap_read_unlock(mm);
 	return err;
--
cgit v1.2.3-59-g8ed1b
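[ Illustration, not part of the series: the fix relies on the standard lockdep annotation for taking two read locks of the same lock class. A minimal sketch of that pattern with made-up names (lock_pair_read() is not a kernel function): ]

#include <linux/rwsem.h>
#include <linux/lockdep.h>

/* Sketch only: take read locks on two locks of the same lock class. */
static void lock_pair_read(struct rw_semaphore *a, struct rw_semaphore *b)
{
	down_read(a);			/* first acquisition is ordinary */
	if (a != b)
		/*
		 * The second lock is in the same class, so a plain
		 * down_read() would trip lockdep's "possible recursive
		 * locking detected" warning; tell lockdep the nesting is
		 * intentional and bounded to one level.
		 */
		down_read_nested(b, SINGLE_DEPTH_NESTING);
}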
From 90a7592da14951bd21f74a53246ba30955a648aa Mon Sep 17 00:00:00 2001
From: David Hildenbrand
Date: Thu, 11 Apr 2024 18:14:40 +0200
Subject: mm/userfaultfd: Do not place zeropages when zeropages are disallowed

s390x must disable shared zeropages for processes running VMs, because
the VMs could end up making use of "storage keys" or protected
virtualization, which are incompatible with shared zeropages.

Yet, with userfaultfd it is possible to insert shared zeropages into
such processes. Let's fall back to simply allocating a fresh zeroed
anonymous folio and insert that instead.

mm_forbids_zeropage() was introduced in commit 593befa6ab74 ("mm: introduce
mm_forbids_zeropage function"), briefly before userfaultfd went upstream.

Note that we don't want to fail the UFFDIO_ZEROPAGE request like we do
for hugetlb, it would be rather unexpected. Further, we also cannot
really indicate "not supported" to user space ahead of time: it could be
that the MM disallows zeropages after userfaultfd was already registered.

[ agordeev: Fixed checkpatch complaints ]

Fixes: c1a4de99fada ("userfaultfd: mcopy_atomic|mfill_zeropage: UFFDIO_COPY|UFFDIO_ZEROPAGE preparation")
Reviewed-by: Peter Xu
Link: https://lore.kernel.org/r/20240411161441.910170-2-david@redhat.com
Signed-off-by: David Hildenbrand
Signed-off-by: Alexander Gordeev
---
 mm/userfaultfd.c | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

(limited to 'mm/userfaultfd.c')

diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 3c3539c573e7..829f7b1089fc 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -316,6 +316,38 @@ out_release:
 	goto out;
 }
 
+static int mfill_atomic_pte_zeroed_folio(pmd_t *dst_pmd,
+					 struct vm_area_struct *dst_vma,
+					 unsigned long dst_addr)
+{
+	struct folio *folio;
+	int ret = -ENOMEM;
+
+	folio = vma_alloc_zeroed_movable_folio(dst_vma, dst_addr);
+	if (!folio)
+		return ret;
+
+	if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL))
+		goto out_put;
+
+	/*
+	 * The memory barrier inside __folio_mark_uptodate makes sure that
+	 * zeroing out the folio become visible before mapping the page
+	 * using set_pte_at(). See do_anonymous_page().
+	 */
+	__folio_mark_uptodate(folio);
+
+	ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
+				       &folio->page, true, 0);
+	if (ret)
+		goto out_put;
+
+	return 0;
+out_put:
+	folio_put(folio);
+	return ret;
+}
+
 static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd,
 				     struct vm_area_struct *dst_vma,
 				     unsigned long dst_addr)
@@ -324,6 +356,9 @@ static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd,
 	spinlock_t *ptl;
 	int ret;
 
+	if (mm_forbids_zeropage(dst_vma->vm_mm))
+		return mfill_atomic_pte_zeroed_folio(dst_pmd, dst_vma, dst_addr);
+
 	_dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
 					 dst_vma->vm_page_prot));
 	ret = -EAGAIN;
--
cgit v1.2.3-59-g8ed1b
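[ Illustration, not part of the series: from user space the fallback is invisible; a fault is still resolved with UFFDIO_ZEROPAGE as before. A minimal sketch of such a resolver (resolve_with_zeropage() and its parameters are illustrative, not kernel or libc API): ]

#include <linux/userfaultfd.h>
#include <sys/ioctl.h>
#include <stdint.h>
#include <stddef.h>

/* Resolve a missing-page fault at 'fault_addr' by mapping a zeroed page. */
static int resolve_with_zeropage(int uffd, uintptr_t fault_addr, size_t page_size)
{
	struct uffdio_zeropage zp = {
		.range = {
			.start = fault_addr & ~(uintptr_t)(page_size - 1),
			.len   = page_size,
		},
		.mode = 0,	/* or UFFDIO_ZEROPAGE_MODE_DONTWAKE */
	};

	/*
	 * On MMs where shared zeropages are forbidden (s390x VMs), the kernel
	 * now installs a freshly zeroed anonymous folio instead; the ioctl's
	 * contract does not change.
	 */
	if (ioctl(uffd, UFFDIO_ZEROPAGE, &zp) == -1)
		return -1;
	return 0;
}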
From 5beaee54a324ba1fe307e341ec825d5d099f4091 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)"
Date: Tue, 26 Mar 2024 20:28:22 +0000
Subject: mm: add is_huge_zero_folio()

This is the folio equivalent of is_huge_zero_page(). It doesn't add any
efficiency, but it does prevent the caller from passing a tail page and
getting confused when the predicate returns false.

Link: https://lkml.kernel.org/r/20240326202833.523759-3-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle)
Reviewed-by: David Hildenbrand
Signed-off-by: Andrew Morton
---
 fs/proc/page.c          |  2 +-
 include/linux/huge_mm.h | 10 ++++++++++
 mm/huge_memory.c        |  6 +++---
 mm/mempolicy.c          |  2 +-
 mm/swap.c               |  2 +-
 mm/swap_state.c         |  2 +-
 mm/userfaultfd.c        |  2 +-
 7 files changed, 18 insertions(+), 8 deletions(-)

(limited to 'mm/userfaultfd.c')

diff --git a/fs/proc/page.c b/fs/proc/page.c
index 05120263af2a..2fb64bdb64eb 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -155,7 +155,7 @@ u64 stable_page_flags(const struct page *page)
 	else if (folio_test_large(folio)) {
 		if ((k & (1 << PG_lru)) || is_anon)
 			u |= 1 << KPF_THP;
-		else if (is_huge_zero_page(&folio->page)) {
+		else if (is_huge_zero_folio(folio)) {
 			u |= 1 << KPF_ZERO_PAGE;
 			u |= 1 << KPF_THP;
 		}
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 1540a1481daf..600c6008262b 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -356,6 +356,11 @@ static inline bool is_huge_zero_page(const struct page *page)
 	return READ_ONCE(huge_zero_page) == page;
 }
 
+static inline bool is_huge_zero_folio(const struct folio *folio)
+{
+	return READ_ONCE(huge_zero_page) == &folio->page;
+}
+
 static inline bool is_huge_zero_pmd(pmd_t pmd)
 {
 	return pmd_present(pmd) && READ_ONCE(huge_zero_pfn) == pmd_pfn(pmd);
@@ -485,6 +490,11 @@ static inline bool is_huge_zero_page(const struct page *page)
 	return false;
 }
 
+static inline bool is_huge_zero_folio(const struct folio *folio)
+{
+	return false;
+}
+
 static inline bool is_huge_zero_pmd(pmd_t pmd)
 {
 	return false;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 75ad971ca45e..5c043c7b5062 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -789,12 +789,12 @@ struct deferred_split *get_deferred_split_queue(struct folio *folio)
 }
 #endif
 
-static inline bool is_transparent_hugepage(struct folio *folio)
+static inline bool is_transparent_hugepage(const struct folio *folio)
 {
 	if (!folio_test_large(folio))
 		return false;
 
-	return is_huge_zero_page(&folio->page) ||
+	return is_huge_zero_folio(folio) ||
 	       folio_test_large_rmappable(folio);
 }
 
@@ -3085,7 +3085,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
 		}
 	}
 
-	is_hzp = is_huge_zero_page(&folio->page);
+	is_hzp = is_huge_zero_folio(folio);
 	if (is_hzp) {
 		pr_warn_ratelimited("Called split_huge_page for huge zero page\n");
 		return -EBUSY;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 913cff5da5a3..5743028a63a5 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -510,7 +510,7 @@ static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk)
 		return;
 	}
 	folio = pfn_folio(pmd_pfn(*pmd));
-	if (is_huge_zero_page(&folio->page)) {
+	if (is_huge_zero_folio(folio)) {
 		walk->action = ACTION_CONTINUE;
 		return;
 	}
diff --git a/mm/swap.c b/mm/swap.c
index 500a09a48dfd..f72364e92d5f 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -985,7 +985,7 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs)
 		struct folio *folio = folios->folios[i];
 		unsigned int nr_refs = refs ? refs[i] : 1;
 
-		if (is_huge_zero_page(&folio->page))
+		if (is_huge_zero_folio(folio))
 			continue;
 
 		if (folio_is_zone_device(folio)) {
diff --git a/mm/swap_state.c b/mm/swap_state.c
index bfc7e8c58a6d..2deac23633cd 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -301,7 +301,7 @@ void free_page_and_swap_cache(struct page *page)
 	struct folio *folio = page_folio(page);
 
 	free_swap_cache(folio);
-	if (!is_huge_zero_page(page))
+	if (!is_huge_zero_folio(folio))
 		folio_put(folio);
 }
 
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 3c3539c573e7..a0ec14553fbe 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -1664,7 +1664,7 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
 		    !pmd_none(dst_pmdval)) {
 			struct folio *folio = pfn_folio(pmd_pfn(*src_pmd));
 
-			if (!folio || (!is_huge_zero_page(&folio->page) &&
+			if (!folio || (!is_huge_zero_folio(folio) &&
 				       !PageAnonExclusive(&folio->page))) {
 				spin_unlock(ptl);
 				err = -EBUSY;
--
cgit v1.2.3-59-g8ed1b
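[ Illustration, not part of the series: the tail-page point from the commit message, as a sketch. huge_zero_page is internal to mm/huge_memory.c, and hzp_check_sketch() is made up: ]

#include <linux/huge_mm.h>
#include <linux/page-flags.h>

/*
 * Sketch only: for a tail page that belongs to the huge zero folio, the
 * page-based predicate answers "no" (it compares against the head page),
 * while the folio-based one cannot be handed a tail page at all.
 */
static bool hzp_check_sketch(struct page *maybe_tail)
{
	bool by_page  = is_huge_zero_page(maybe_tail);		     /* false for tail pages */
	bool by_folio = is_huge_zero_folio(page_folio(maybe_tail));  /* true for any page of the folio */

	return by_page == by_folio;	/* differs exactly when a tail page was passed */
}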
From e06d03d5590ae1c257b8aa2cfbfe6765e0755c14 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)"
Date: Tue, 26 Mar 2024 20:28:23 +0000
Subject: mm: add pmd_folio()

Convert directly from a pmd to a folio without going through another
representation first. For now this is just a slightly shorter way to
write it, but it might end up being more efficient later.

Link: https://lkml.kernel.org/r/20240326202833.523759-4-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle)
Reviewed-by: David Hildenbrand
Signed-off-by: Andrew Morton
---
 include/linux/pgtable.h | 2 ++
 mm/huge_memory.c        | 6 +++---
 mm/madvise.c            | 2 +-
 mm/mempolicy.c          | 2 +-
 mm/mlock.c              | 2 +-
 mm/userfaultfd.c        | 2 +-
 6 files changed, 9 insertions(+), 7 deletions(-)

(limited to 'mm/userfaultfd.c')

diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 600e17d03659..09c85c7bf9c2 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -50,6 +50,8 @@
 #define pmd_pgtable(pmd) pmd_page(pmd)
 #endif
 
+#define pmd_folio(pmd) page_folio(pmd_page(pmd))
+
 /*
  * A page table page can be thought of an array like this: pXd_t[PTRS_PER_PxD]
  *
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 5c043c7b5062..712263e3b1f6 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1816,7 +1816,7 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		goto out;
 	}
 
-	folio = pfn_folio(pmd_pfn(orig_pmd));
+	folio = pmd_folio(orig_pmd);
 	/*
 	 * If other processes are mapping this folio, we couldn't discard
 	 * the folio unless they all do MADV_FREE so let's skip the folio.
@@ -2086,7 +2086,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		if (pmd_protnone(*pmd))
 			goto unlock;
 
-		folio = page_folio(pmd_page(*pmd));
+		folio = pmd_folio(*pmd);
 		toptier = node_is_toptier(folio_nid(folio));
 		/*
 		 * Skip scanning top tier node if normal numa
@@ -2663,7 +2663,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 		 * It's safe to call pmd_page when folio is set because it's
 		 * guaranteed that pmd is present.
 		 */
-		if (folio && folio != page_folio(pmd_page(*pmd)))
+		if (folio && folio != pmd_folio(*pmd))
 			goto out;
 		__split_huge_pmd_locked(vma, pmd, range.start, freeze);
 	}
diff --git a/mm/madvise.c b/mm/madvise.c
index 7625830d6ae9..1f77a51baaac 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -363,7 +363,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 			goto huge_unlock;
 		}
 
-		folio = pfn_folio(pmd_pfn(orig_pmd));
+		folio = pmd_folio(orig_pmd);
 
 		/* Do not interfere with other mappings of this folio */
 		if (folio_likely_mapped_shared(folio))
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 5743028a63a5..aec756ae5637 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -509,7 +509,7 @@ static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk)
 		qp->nr_failed++;
 		return;
 	}
-	folio = pfn_folio(pmd_pfn(*pmd));
+	folio = pmd_folio(*pmd);
 	if (is_huge_zero_folio(folio)) {
 		walk->action = ACTION_CONTINUE;
 		return;
diff --git a/mm/mlock.c b/mm/mlock.c
index 1ed2f2ab37cd..30b51cdea89d 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -378,7 +378,7 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
 			goto out;
 		if (is_huge_zero_pmd(*pmd))
 			goto out;
-		folio = page_folio(pmd_page(*pmd));
+		folio = pmd_folio(*pmd);
 		if (vma->vm_flags & VM_LOCKED)
 			mlock_folio(folio);
 		else
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index a0ec14553fbe..b70618e8dcd2 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -1662,7 +1662,7 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
 		/* Check if we can move the pmd without splitting it. */
 		if (move_splits_huge_pmd(dst_addr, src_addr, src_start + len) ||
 		    !pmd_none(dst_pmdval)) {
-			struct folio *folio = pfn_folio(pmd_pfn(*src_pmd));
+			struct folio *folio = pmd_folio(*src_pmd);
 
 			if (!folio || (!is_huge_zero_folio(folio) &&
 				       !PageAnonExclusive(&folio->page))) {
--
cgit v1.2.3-59-g8ed1b
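[ Illustration, not part of the series: pmd_folio() is only shorthand. A made-up helper showing the intended use (pmd_folio_or_null() is not a kernel function): ]

#include <linux/pgtable.h>
#include <linux/huge_mm.h>

/* Sketch only: fetch the folio behind a present huge pmd, or NULL. */
static struct folio *pmd_folio_or_null(pmd_t pmdval)
{
	if (!pmd_present(pmdval) || !pmd_trans_huge(pmdval))
		return NULL;

	/* expands to page_folio(pmd_page(pmdval)) */
	return pmd_folio(pmdval);
}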
From b5ba3a64279355731252098d92550e12bf9649e4 Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan
Date: Sun, 14 Apr 2024 19:08:21 -0700
Subject: userfaultfd: remove WRITE_ONCE when setting folio->index during UFFDIO_MOVE

When folio is moved with UFFDIO_MOVE it gets locked before the rmap and
index are modified. Due to the folio lock being already held, WRITE_ONCE()
is not needed when setting the folio index. Remove it.

Link: https://lkml.kernel.org/r/20240415020821.1152951-1-surenb@google.com
Reported-by: Matthew Wilcox
Signed-off-by: Suren Baghdasaryan
Reviewed-by: David Hildenbrand
Reviewed-by: Peter Xu
Cc: Lokesh Gidra
Signed-off-by: Andrew Morton
---
 mm/huge_memory.c | 2 +-
 mm/userfaultfd.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'mm/userfaultfd.c')

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 264e09043f09..31b6bbffea52 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2200,7 +2200,7 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm
 		}
 
 		folio_move_anon_rmap(src_folio, dst_vma);
-		WRITE_ONCE(src_folio->index, linear_page_index(dst_vma, dst_addr));
+		src_folio->index = linear_page_index(dst_vma, dst_addr);
 
 		_dst_pmd = mk_huge_pmd(&src_folio->page, dst_vma->vm_page_prot);
 		/* Follow mremap() behavior and treat the entry dirty after the move */
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index b70618e8dcd2..575ccf90325a 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -1026,7 +1026,7 @@ static int move_present_pte(struct mm_struct *mm,
 	}
 
 	folio_move_anon_rmap(src_folio, dst_vma);
-	WRITE_ONCE(src_folio->index, linear_page_index(dst_vma, dst_addr));
+	src_folio->index = linear_page_index(dst_vma, dst_addr);
 
 	orig_dst_pte = mk_pte(&src_folio->page, dst_vma->vm_page_prot);
 	/* Follow mremap() behavior and treat the entry dirty after the move */
--
cgit v1.2.3-59-g8ed1b
From a568b4126b20ebbc01914e12d083379720911799 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)"
Date: Tue, 23 Apr 2024 23:55:36 +0100
Subject: userfault; expand folio use in mfill_atomic_install_pte()

Call page_folio() a little earlier so we can use folio_mapping()
instead of page_mapping(), saving a call to compound_head().

Link: https://lkml.kernel.org/r/20240423225552.4113447-6-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle)
Reviewed-by: David Hildenbrand
Cc: Eric Biggers
Cc: Sidhartha Kumar
Signed-off-by: Andrew Morton
---
 mm/userfaultfd.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'mm/userfaultfd.c')

diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 575ccf90325a..8b1005ef9dfa 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -180,9 +180,9 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd,
 	pte_t _dst_pte, *dst_pte;
 	bool writable = dst_vma->vm_flags & VM_WRITE;
 	bool vm_shared = dst_vma->vm_flags & VM_SHARED;
-	bool page_in_cache = page_mapping(page);
 	spinlock_t *ptl;
-	struct folio *folio;
+	struct folio *folio = page_folio(page);
+	bool page_in_cache = folio_mapping(folio);
 
 	_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
 	_dst_pte = pte_mkdirty(_dst_pte);
@@ -212,7 +212,6 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd,
 	if (!pte_none_mostly(ptep_get(dst_pte)))
 		goto out_unlock;
 
-	folio = page_folio(page);
 	if (page_in_cache) {
 		/* Usually, cache pages are already added to LRU */
 		if (newly_allocated)
--
cgit v1.2.3-59-g8ed1b
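[ Illustration, not part of the series: the general pattern the previous patch applies, resolve the folio once and then use folio_*() accessors, in a made-up helper: ]

#include <linux/mm.h>
#include <linux/pagemap.h>

/* Sketch only: prefer one page_folio() up front over repeated page_*() calls. */
static bool page_is_file_backed_sketch(struct page *page)
{
	struct folio *folio = page_folio(page);	/* one compound_head() lookup */

	/* folio_mapping() does not need to find the head page again */
	return folio_mapping(folio) != NULL;
}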
From 73b4a0cd8243709870701349611722ba3c351815 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)"
Date: Fri, 26 Apr 2024 15:45:02 +0100
Subject: mm: fix some minor per-VMA lock issues in userfaultfd

Rename lock_vma() to uffd_lock_vma() because it really is uffd specific.
Remove comment referencing unlock_vma() which doesn't exist. Fix the
comment about lock_vma_under_rcu() which I just made incorrect.

Link: https://lkml.kernel.org/r/20240426144506.1290619-4-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle)
Reviewed-by: Suren Baghdasaryan
Cc: David Hildenbrand
Cc: Jann Horn
Signed-off-by: Andrew Morton
---
 mm/userfaultfd.c | 20 +++++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

(limited to 'mm/userfaultfd.c')

diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 8b1005ef9dfa..d9e82ae68244 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -56,17 +56,16 @@ struct vm_area_struct *find_vma_and_prepare_anon(struct mm_struct *mm,
 
 #ifdef CONFIG_PER_VMA_LOCK
 /*
- * lock_vma() - Lookup and lock vma corresponding to @address.
+ * uffd_lock_vma() - Lookup and lock vma corresponding to @address.
  * @mm: mm to search vma in.
  * @address: address that the vma should contain.
  *
- * Should be called without holding mmap_lock. vma should be unlocked after use
- * with unlock_vma().
+ * Should be called without holding mmap_lock.
  *
  * Return: A locked vma containing @address, -ENOENT if no vma is found, or
  * -ENOMEM if anon_vma couldn't be allocated.
  */
-static struct vm_area_struct *lock_vma(struct mm_struct *mm,
+static struct vm_area_struct *uffd_lock_vma(struct mm_struct *mm,
 				       unsigned long address)
 {
 	struct vm_area_struct *vma;
@@ -74,9 +73,8 @@ static struct vm_area_struct *lock_vma(struct mm_struct *mm,
 	vma = lock_vma_under_rcu(mm, address);
 	if (vma) {
 		/*
-		 * lock_vma_under_rcu() only checks anon_vma for private
-		 * anonymous mappings. But we need to ensure it is assigned in
-		 * private file-backed vmas as well.
+		 * We know we're going to need to use anon_vma, so check
+		 * that early.
 		 */
 		if (!(vma->vm_flags & VM_SHARED) && unlikely(!vma->anon_vma))
 			vma_end_read(vma);
@@ -107,7 +105,7 @@ static struct vm_area_struct *uffd_mfill_lock(struct mm_struct *dst_mm,
 {
 	struct vm_area_struct *dst_vma;
 
-	dst_vma = lock_vma(dst_mm, dst_start);
+	dst_vma = uffd_lock_vma(dst_mm, dst_start);
 	if (IS_ERR(dst_vma) || validate_dst_vma(dst_vma, dst_start + len))
 		return dst_vma;
 
@@ -1401,7 +1399,7 @@ static int uffd_move_lock(struct mm_struct *mm,
 	struct vm_area_struct *vma;
 	int err;
 
-	vma = lock_vma(mm, dst_start);
+	vma = uffd_lock_vma(mm, dst_start);
 	if (IS_ERR(vma))
 		return PTR_ERR(vma);
 
@@ -1416,7 +1414,7 @@ static int uffd_move_lock(struct mm_struct *mm,
 	}
 
 	/*
-	 * Using lock_vma() to get src_vma can lead to following deadlock:
+	 * Using uffd_lock_vma() to get src_vma can lead to following deadlock:
 	 *
 	 * Thread1				Thread2
 	 * -------				-------
@@ -1438,7 +1436,7 @@ static int uffd_move_lock(struct mm_struct *mm,
 	err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap);
 	if (!err) {
 		/*
-		 * See comment in lock_vma() as to why not using
+		 * See comment in uffd_lock_vma() as to why not using
		 * vma_start_read() here.
 		 */
 		down_read(&(*dst_vmap)->vm_lock->lock);
--
cgit v1.2.3-59-g8ed1b
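[ Illustration, not part of the series: uffd_move_lock()/uffd_lock_vma() sit under the UFFDIO_MOVE ioctl. A minimal user-space sketch of issuing that ioctl (move_pages_sketch() is made up; assumes a userfaultfd registered with UFFD_FEATURE_MOVE): ]

#include <linux/userfaultfd.h>
#include <sys/ioctl.h>
#include <stdint.h>
#include <stddef.h>

/* Move 'len' bytes of anonymous memory from 'src' to 'dst' within one mm. */
static int move_pages_sketch(int uffd, void *dst, void *src, size_t len)
{
	struct uffdio_move mv = {
		.dst  = (uintptr_t)dst,
		.src  = (uintptr_t)src,
		.len  = len,
		.mode = 0,	/* or UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES */
	};

	/* The kernel locks the dst and src VMAs as in uffd_move_lock() above. */
	if (ioctl(uffd, UFFDIO_MOVE, &mv) == -1)
		return -1;

	/* mv.move reports how many bytes were actually moved. */
	return mv.move == (long long)len ? 0 : -1;
}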