From acfac37851e01b40c30a7afd0d93ad8db8914f25 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 7 Oct 2022 12:59:20 -0700 Subject: mm/hugetlb.c: make __hugetlb_vma_unlock_write_put() static Reported-by: kernel test robot Cc: Mike Kravetz Signed-off-by: Andrew Morton --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0ad53ad98e74..41d3aa077837 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6804,7 +6804,7 @@ void hugetlb_vma_lock_release(struct kref *kref) kfree(vma_lock); } -void __hugetlb_vma_unlock_write_put(struct hugetlb_vma_lock *vma_lock) +static void __hugetlb_vma_unlock_write_put(struct hugetlb_vma_lock *vma_lock) { struct vm_area_struct *vma = vma_lock->vma; -- cgit v1.2.3-59-g8ed1b From 515778e2d790652a38a24554fdb7f21420d91efc Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 30 Sep 2022 20:25:55 -0400 Subject: mm/uffd: fix warning without PTE_MARKER_UFFD_WP compiled in When PTE_MARKER_UFFD_WP not configured, it's still possible to reach pte marker code and trigger an warning. Add a few CONFIG_PTE_MARKER_UFFD_WP ifdefs to make sure the code won't be reached when not compiled in. Link: https://lkml.kernel.org/r/YzeR+R6b4bwBlBHh@x1n Fixes: b1f9e876862d ("mm/uffd: enable write protection for shmem & hugetlbfs") Signed-off-by: Peter Xu Reported-by: Cc: Axel Rasmussen Cc: Brian Geffon Cc: Edward Liaw Cc: Liu Shixin Cc: Mike Kravetz Cc: Signed-off-by: Andrew Morton --- mm/hugetlb.c | 4 ++++ mm/memory.c | 2 ++ mm/mprotect.c | 2 ++ 3 files changed, 8 insertions(+) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 41d3aa077837..9a910612336d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5096,6 +5096,7 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct * unmapped and its refcount is dropped, so just clear pte here. */ if (unlikely(!pte_present(pte))) { +#ifdef CONFIG_PTE_MARKER_UFFD_WP /* * If the pte was wr-protected by uffd-wp in any of the * swap forms, meanwhile the caller does not want to @@ -5107,6 +5108,7 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct set_huge_pte_at(mm, address, ptep, make_pte_marker(PTE_MARKER_UFFD_WP)); else +#endif huge_pte_clear(mm, address, ptep, sz); spin_unlock(ptl); continue; @@ -5135,11 +5137,13 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct tlb_remove_huge_tlb_entry(h, tlb, ptep, address); if (huge_pte_dirty(pte)) set_page_dirty(page); +#ifdef CONFIG_PTE_MARKER_UFFD_WP /* Leave a uffd-wp pte marker if needed */ if (huge_pte_uffd_wp(pte) && !(zap_flags & ZAP_FLAG_DROP_MARKER)) set_huge_pte_at(mm, address, ptep, make_pte_marker(PTE_MARKER_UFFD_WP)); +#endif hugetlb_count_sub(pages_per_huge_page(h), mm); page_remove_rmap(page, vma, true); diff --git a/mm/memory.c b/mm/memory.c index df678fa30cdb..2c7723ea4371 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1393,10 +1393,12 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, pte_t *pte, struct zap_details *details, pte_t pteval) { +#ifdef CONFIG_PTE_MARKER_UFFD_WP if (zap_drop_file_uffd_wp(details)) return; pte_install_uffd_wp_if_needed(vma, addr, pte, pteval); +#endif } static unsigned long zap_pte_range(struct mmu_gather *tlb, diff --git a/mm/mprotect.c b/mm/mprotect.c index 461dcbd4f21a..668bfaa6ed2a 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -267,6 +267,7 @@ static unsigned long change_pte_range(struct mmu_gather *tlb, } else { /* It must be an none page, or what else?.. */ WARN_ON_ONCE(!pte_none(oldpte)); +#ifdef CONFIG_PTE_MARKER_UFFD_WP if (unlikely(uffd_wp && !vma_is_anonymous(vma))) { /* * For file-backed mem, we need to be able to @@ -278,6 +279,7 @@ static unsigned long change_pte_range(struct mmu_gather *tlb, make_pte_marker(PTE_MARKER_UFFD_WP)); pages++; } +#endif } } while (pte++, addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); -- cgit v1.2.3-59-g8ed1b From 2ea7ff1e39cbe3753d3c649beb70f2cf861dca75 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 4 Oct 2022 15:33:58 -0400 Subject: mm/hugetlb: fix race condition of uffd missing/minor handling Patch series "mm/hugetlb: Fix selftest failures with write check", v3. Currently akpm mm-unstable fails with uffd hugetlb private mapping test randomly on a write check. The initial bisection of that points to the recent pmd unshare series, but it turns out there's no direction relationship with the series but only some timing change caused the race to start trigger. The race should be fixed in patch 1. Patch 2 is a trivial cleanup on the similar race with hugetlb migrations, patch 3 comment on the write check so when anyone read it again it'll be clear why it's there. This patch (of 3): After the recent rework patchset of hugetlb locking on pmd sharing, kselftest for userfaultfd sometimes fails on hugetlb private tests with unexpected write fault checks. It turns out there's nothing wrong within the locking series regarding this matter, but it could have changed the timing of threads so it can trigger an old bug. The real bug is when we call hugetlb_no_page() we're not with the pgtable lock. It means we're reading the pte values lockless. It's perfectly fine in most cases because before we do normal page allocations we'll take the lock and check pte_same() again. However before that, there are actually two paths on userfaultfd missing/minor handling that may directly move on with the fault process without checking the pte values. It means for these two paths we may be generating an uffd message based on an unstable pte, while an unstable pte can legally be anything as long as the modifier holds the pgtable lock. One example, which is also what happened in the failing kselftest and caused the test failure, is that for private mappings wr-protection changes can happen on one page. While hugetlb_change_protection() generally requires pte being cleared before being changed, then there can be a race condition like: thread 1 thread 2 -------- -------- UFFDIO_WRITEPROTECT hugetlb_fault hugetlb_change_protection pgtable_lock() huge_ptep_modify_prot_start pte==NULL hugetlb_no_page generate uffd missing event even if page existed!! huge_ptep_modify_prot_commit pgtable_unlock() Fix this by rechecking the pte after pgtable lock for both userfaultfd missing & minor fault paths. This bug should have been around starting from uffd hugetlb introduced, so attaching a Fixes to the commit. Also attach another Fixes to the minor support commit for easier tracking. Note that userfaultfd is actually fine with false positives (e.g. caused by pte changed), but not wrong logical events (e.g. caused by reading a pte during changing). The latter can confuse the userspace, so the strictness is very much preferred. E.g., MISSING event should never happen on the page after UFFDIO_COPY has correctly installed the page and returned. Link: https://lkml.kernel.org/r/20221004193400.110155-1-peterx@redhat.com Link: https://lkml.kernel.org/r/20221004193400.110155-2-peterx@redhat.com Fixes: 1a1aad8a9b7b ("userfaultfd: hugetlbfs: add userfaultfd hugetlb hook") Fixes: 7677f7fd8be7 ("userfaultfd: add minor fault registration mode") Signed-off-by: Peter Xu Co-developed-by: Mike Kravetz Reviewed-by: Mike Kravetz Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: Nadav Amit Cc: David Hildenbrand Cc: Mike Rapoport Signed-off-by: Andrew Morton --- mm/hugetlb.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 52 insertions(+), 7 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 9a910612336d..bf9d8d04bf4f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5535,6 +5535,23 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma, return handle_userfault(&vmf, reason); } +/* + * Recheck pte with pgtable lock. Returns true if pte didn't change, or + * false if pte changed or is changing. + */ +static bool hugetlb_pte_stable(struct hstate *h, struct mm_struct *mm, + pte_t *ptep, pte_t old_pte) +{ + spinlock_t *ptl; + bool same; + + ptl = huge_pte_lock(h, mm, ptep); + same = pte_same(huge_ptep_get(ptep), old_pte); + spin_unlock(ptl); + + return same; +} + static vm_fault_t hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, struct address_space *mapping, pgoff_t idx, @@ -5575,10 +5592,33 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, if (idx >= size) goto out; /* Check for page in userfault range */ - if (userfaultfd_missing(vma)) - return hugetlb_handle_userfault(vma, mapping, idx, - flags, haddr, address, - VM_UFFD_MISSING); + if (userfaultfd_missing(vma)) { + /* + * Since hugetlb_no_page() was examining pte + * without pgtable lock, we need to re-test under + * lock because the pte may not be stable and could + * have changed from under us. Try to detect + * either changed or during-changing ptes and retry + * properly when needed. + * + * Note that userfaultfd is actually fine with + * false positives (e.g. caused by pte changed), + * but not wrong logical events (e.g. caused by + * reading a pte during changing). The latter can + * confuse the userspace, so the strictness is very + * much preferred. E.g., MISSING event should + * never happen on the page after UFFDIO_COPY has + * correctly installed the page and returned. + */ + if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) { + ret = 0; + goto out; + } + + return hugetlb_handle_userfault(vma, mapping, idx, flags, + haddr, address, + VM_UFFD_MISSING); + } page = alloc_huge_page(vma, haddr, 0); if (IS_ERR(page)) { @@ -5644,9 +5684,14 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, if (userfaultfd_minor(vma)) { unlock_page(page); put_page(page); - return hugetlb_handle_userfault(vma, mapping, idx, - flags, haddr, address, - VM_UFFD_MINOR); + /* See comment in userfaultfd_missing() block above */ + if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) { + ret = 0; + goto out; + } + return hugetlb_handle_userfault(vma, mapping, idx, flags, + haddr, address, + VM_UFFD_MINOR); } } -- cgit v1.2.3-59-g8ed1b From f9bf6c03eca1077cae8de0e6d86427656fa42a9b Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 4 Oct 2022 15:33:59 -0400 Subject: mm/hugetlb: use hugetlb_pte_stable in migration race check After hugetlb_pte_stable() introduced, we can also rewrite the migration race condition against page allocation to use the new helper too. Link: https://lkml.kernel.org/r/20221004193400.110155-3-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Mike Kravetz Reviewed-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: Mike Rapoport Cc: Nadav Amit Signed-off-by: Andrew Morton --- mm/hugetlb.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index bf9d8d04bf4f..9b26055f3119 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5634,11 +5634,10 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * here. Before returning error, get ptl and make * sure there really is no pte entry. */ - ptl = huge_pte_lock(h, mm, ptep); - ret = 0; - if (huge_pte_none(huge_ptep_get(ptep))) + if (hugetlb_pte_stable(h, mm, ptep, old_pte)) ret = vmf_error(PTR_ERR(page)); - spin_unlock(ptl); + else + ret = 0; goto out; } clear_huge_page(page, address, pages_per_huge_page(h)); -- cgit v1.2.3-59-g8ed1b