From 1db9dbc2ef05205bf1022f9b14aa29b1dd8efd7e Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 12 May 2022 20:22:52 -0700 Subject: mm/uffd: PTE_MARKER_UFFD_WP This patch introduces the 1st user of pte marker: the uffd-wp marker. When the pte marker is installed with the uffd-wp bit set, it means this pte was wr-protected by uffd. We will use this special pte to arm the ptes that got either unmapped or swapped out for a file-backed region that was previously wr-protected. This special pte could trigger a page fault just like swap entries. This idea is greatly inspired by Hugh and Andrea in the discussion, which is referenced in the links below. Some helpers are introduced to detect whether a swap pte is uffd wr-protected. After the pte marker introduced, one swap pte can be wr-protected in two forms: either it is a normal swap pte and it has _PAGE_SWP_UFFD_WP set, or it's a pte marker that has PTE_MARKER_UFFD_WP set. [peterx@redhat.com: fixup] Link: https://lkml.kernel.org/r/YkzKiM8tI4+qOfXF@xz-m1.local Link: https://lore.kernel.org/lkml/20201126222359.8120-1-peterx@redhat.com/ Link: https://lore.kernel.org/lkml/20201130230603.46187-1-peterx@redhat.com/ Link: https://lkml.kernel.org/r/20220405014838.14131-1-peterx@redhat.com Signed-off-by: Peter Xu Suggested-by: Andrea Arcangeli Suggested-by: Hugh Dickins Cc: Alistair Popple Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Jerome Glisse Cc: "Kirill A . Shutemov" Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Signed-off-by: Andrew Morton --- include/linux/userfaultfd_k.h | 47 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) (limited to 'include/linux/userfaultfd_k.h') diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 33cea484d1ad..3354d9ddc35f 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -15,6 +15,8 @@ #include #include +#include +#include #include /* The set of all possible UFFD-related VM flags. */ @@ -236,4 +238,49 @@ static inline void userfaultfd_unmap_complete(struct mm_struct *mm, #endif /* CONFIG_USERFAULTFD */ +static inline bool pte_marker_entry_uffd_wp(swp_entry_t entry) +{ +#ifdef CONFIG_PTE_MARKER_UFFD_WP + return is_pte_marker_entry(entry) && + (pte_marker_get(entry) & PTE_MARKER_UFFD_WP); +#else + return false; +#endif +} + +static inline bool pte_marker_uffd_wp(pte_t pte) +{ +#ifdef CONFIG_PTE_MARKER_UFFD_WP + swp_entry_t entry; + + if (!is_swap_pte(pte)) + return false; + + entry = pte_to_swp_entry(pte); + + return pte_marker_entry_uffd_wp(entry); +#else + return false; +#endif +} + +/* + * Returns true if this is a swap pte and was uffd-wp wr-protected in either + * forms (pte marker or a normal swap pte), false otherwise. + */ +static inline bool pte_swp_uffd_wp_any(pte_t pte) +{ +#ifdef CONFIG_PTE_MARKER_UFFD_WP + if (!is_swap_pte(pte)) + return false; + + if (pte_swp_uffd_wp(pte)) + return true; + + if (pte_marker_uffd_wp(pte)) + return true; +#endif + return false; +} + #endif /* _LINUX_USERFAULTFD_K_H */ -- cgit v1.3-8-gc7d7 From 9c28a205c06123b9f0a0c4d819ece9f5f552d004 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 12 May 2022 20:22:53 -0700 Subject: mm/shmem: handle uffd-wp special pte in page fault handler File-backed memories are prone to unmap/swap so the ptes are always unstable, because they can be easily faulted back later using the page cache. This could lead to uffd-wp getting lost when unmapping or swapping out such memory. One example is shmem. PTE markers are needed to store those information. This patch prepares it by handling uffd-wp pte markers first it is applied elsewhere, so that the page fault handler can recognize uffd-wp pte markers. The handling of uffd-wp pte markers is similar to missing fault, it's just that we'll handle this "missing fault" when we see the pte markers, meanwhile we need to make sure the marker information is kept during processing the fault. This is a slow path of uffd-wp handling, because zapping of wr-protected shmem ptes should be rare. So far it should only trigger in two conditions: (1) When trying to punch holes in shmem_fallocate(), there is an optimization to zap the pgtables before evicting the page. (2) When swapping out shmem pages. Because of this, the page fault handling is simplifed too by not sending the wr-protect message in the 1st page fault, instead the page will be installed read-only, so the uffd-wp message will be generated in the next fault, which will trigger the do_wp_page() path of general uffd-wp handling. Disable fault-around for all uffd-wp registered ranges for extra safety just like uffd-minor fault, and clean the code up. Link: https://lkml.kernel.org/r/20220405014844.14239-1-peterx@redhat.com Signed-off-by: Peter Xu Cc: Alistair Popple Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jerome Glisse Cc: "Kirill A . Shutemov" Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Signed-off-by: Andrew Morton --- include/linux/userfaultfd_k.h | 17 +++++++++++ mm/memory.c | 67 +++++++++++++++++++++++++++++++++++++------ 2 files changed, 75 insertions(+), 9 deletions(-) (limited to 'include/linux/userfaultfd_k.h') diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 3354d9ddc35f..e7afcdfd4b46 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -96,6 +96,18 @@ static inline bool uffd_disable_huge_pmd_share(struct vm_area_struct *vma) return vma->vm_flags & (VM_UFFD_WP | VM_UFFD_MINOR); } +/* + * Don't do fault around for either WP or MINOR registered uffd range. For + * MINOR registered range, fault around will be a total disaster and ptes can + * be installed without notifications; for WP it should mostly be fine as long + * as the fault around checks for pte_none() before the installation, however + * to be super safe we just forbid it. + */ +static inline bool uffd_disable_fault_around(struct vm_area_struct *vma) +{ + return vma->vm_flags & (VM_UFFD_WP | VM_UFFD_MINOR); +} + static inline bool userfaultfd_missing(struct vm_area_struct *vma) { return vma->vm_flags & VM_UFFD_MISSING; @@ -236,6 +248,11 @@ static inline void userfaultfd_unmap_complete(struct mm_struct *mm, { } +static inline bool uffd_disable_fault_around(struct vm_area_struct *vma) +{ + return false; +} + #endif /* CONFIG_USERFAULTFD */ static inline bool pte_marker_entry_uffd_wp(swp_entry_t entry) diff --git a/mm/memory.c b/mm/memory.c index 878da420d97b..c16f873373a2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3559,6 +3559,39 @@ static inline bool should_try_to_free_swap(struct page *page, page_count(page) == 2; } +static vm_fault_t pte_marker_clear(struct vm_fault *vmf) +{ + vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, + vmf->address, &vmf->ptl); + /* + * Be careful so that we will only recover a special uffd-wp pte into a + * none pte. Otherwise it means the pte could have changed, so retry. + */ + if (is_pte_marker(*vmf->pte)) + pte_clear(vmf->vma->vm_mm, vmf->address, vmf->pte); + pte_unmap_unlock(vmf->pte, vmf->ptl); + return 0; +} + +/* + * This is actually a page-missing access, but with uffd-wp special pte + * installed. It means this pte was wr-protected before being unmapped. + */ +static vm_fault_t pte_marker_handle_uffd_wp(struct vm_fault *vmf) +{ + /* + * Just in case there're leftover special ptes even after the region + * got unregistered - we can simply clear them. We can also do that + * proactively when e.g. when we do UFFDIO_UNREGISTER upon some uffd-wp + * ranges, but it should be more efficient to be done lazily here. + */ + if (unlikely(!userfaultfd_wp(vmf->vma) || vma_is_anonymous(vmf->vma))) + return pte_marker_clear(vmf); + + /* do_fault() can handle pte markers too like none pte */ + return do_fault(vmf); +} + static vm_fault_t handle_pte_marker(struct vm_fault *vmf) { swp_entry_t entry = pte_to_swp_entry(vmf->orig_pte); @@ -3572,8 +3605,11 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf) if (WARN_ON_ONCE(vma_is_anonymous(vmf->vma) || !marker)) return VM_FAULT_SIGBUS; - /* TODO: handle pte markers */ - return 0; + if (pte_marker_entry_uffd_wp(entry)) + return pte_marker_handle_uffd_wp(vmf); + + /* This is an unknown pte marker */ + return VM_FAULT_SIGBUS; } /* @@ -4157,6 +4193,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) { struct vm_area_struct *vma = vmf->vma; + bool uffd_wp = pte_marker_uffd_wp(vmf->orig_pte); bool write = vmf->flags & FAULT_FLAG_WRITE; bool prefault = vmf->address != addr; pte_t entry; @@ -4171,6 +4208,8 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) if (write) entry = maybe_mkwrite(pte_mkdirty(entry), vma); + if (unlikely(uffd_wp)) + entry = pte_mkuffd_wp(pte_wrprotect(entry)); /* copy-on-write page */ if (write && !(vma->vm_flags & VM_SHARED)) { inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); @@ -4352,9 +4391,21 @@ static vm_fault_t do_fault_around(struct vm_fault *vmf) return vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff); } +/* Return true if we should do read fault-around, false otherwise */ +static inline bool should_fault_around(struct vm_fault *vmf) +{ + /* No ->map_pages? No way to fault around... */ + if (!vmf->vma->vm_ops->map_pages) + return false; + + if (uffd_disable_fault_around(vmf->vma)) + return false; + + return fault_around_bytes >> PAGE_SHIFT > 1; +} + static vm_fault_t do_read_fault(struct vm_fault *vmf) { - struct vm_area_struct *vma = vmf->vma; vm_fault_t ret = 0; /* @@ -4362,12 +4413,10 @@ static vm_fault_t do_read_fault(struct vm_fault *vmf) * if page by the offset is not ready to be mapped (cold cache or * something). */ - if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) { - if (likely(!userfaultfd_minor(vmf->vma))) { - ret = do_fault_around(vmf); - if (ret) - return ret; - } + if (should_fault_around(vmf)) { + ret = do_fault_around(vmf); + if (ret) + return ret; } ret = __do_fault(vmf); -- cgit v1.3-8-gc7d7 From b1f9e876862d8f7176299ec4fb2108bc1045cbc8 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 12 May 2022 20:22:56 -0700 Subject: mm/uffd: enable write protection for shmem & hugetlbfs We've had all the necessary changes ready for both shmem and hugetlbfs. Turn on all the shmem/hugetlbfs switches for userfaultfd-wp. We can expand UFFD_API_RANGE_IOCTLS_BASIC with _UFFDIO_WRITEPROTECT too because all existing types now support write protection mode. Since vma_can_userfault() will be used elsewhere, move into userfaultfd_k.h. Link: https://lkml.kernel.org/r/20220405014926.15101-1-peterx@redhat.com Signed-off-by: Peter Xu Cc: Alistair Popple Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jerome Glisse Cc: "Kirill A . Shutemov" Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 21 +++------------------ include/linux/userfaultfd_k.h | 20 ++++++++++++++++++++ include/uapi/linux/userfaultfd.h | 10 ++++++++-- mm/userfaultfd.c | 9 +++------ 4 files changed, 34 insertions(+), 26 deletions(-) (limited to 'include/linux/userfaultfd_k.h') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 78b68e0f9774..e943370107d0 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1258,24 +1258,6 @@ static __always_inline int validate_range(struct mm_struct *mm, return 0; } -static inline bool vma_can_userfault(struct vm_area_struct *vma, - unsigned long vm_flags) -{ - /* FIXME: add WP support to hugetlbfs and shmem */ - if (vm_flags & VM_UFFD_WP) { - if (is_vm_hugetlb_page(vma) || vma_is_shmem(vma)) - return false; - } - - if (vm_flags & VM_UFFD_MINOR) { - if (!(is_vm_hugetlb_page(vma) || vma_is_shmem(vma))) - return false; - } - - return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) || - vma_is_shmem(vma); -} - static int userfaultfd_register(struct userfaultfd_ctx *ctx, unsigned long arg) { @@ -1956,6 +1938,9 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, #endif #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP; +#endif +#ifndef CONFIG_PTE_MARKER_UFFD_WP + uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM; #endif uffdio_api.ioctls = UFFD_API_IOCTLS; ret = -EFAULT; diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index e7afcdfd4b46..732b522bacb7 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -18,6 +18,7 @@ #include #include #include +#include /* The set of all possible UFFD-related VM flags. */ #define __VM_UFFD_FLAGS (VM_UFFD_MISSING | VM_UFFD_WP | VM_UFFD_MINOR) @@ -140,6 +141,25 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma) return vma->vm_flags & __VM_UFFD_FLAGS; } +static inline bool vma_can_userfault(struct vm_area_struct *vma, + unsigned long vm_flags) +{ + if (vm_flags & VM_UFFD_MINOR) + return is_vm_hugetlb_page(vma) || vma_is_shmem(vma); + +#ifndef CONFIG_PTE_MARKER_UFFD_WP + /* + * If user requested uffd-wp but not enabled pte markers for + * uffd-wp, then shmem & hugetlbfs are not supported but only + * anonymous. + */ + if ((vm_flags & VM_UFFD_WP) && !vma_is_anonymous(vma)) + return false; +#endif + return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) || + vma_is_shmem(vma); +} + extern int dup_userfaultfd(struct vm_area_struct *, struct list_head *); extern void dup_userfaultfd_complete(struct list_head *); diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index ef739054cb1c..7d32b1e797fb 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -33,7 +33,8 @@ UFFD_FEATURE_THREAD_ID | \ UFFD_FEATURE_MINOR_HUGETLBFS | \ UFFD_FEATURE_MINOR_SHMEM | \ - UFFD_FEATURE_EXACT_ADDRESS) + UFFD_FEATURE_EXACT_ADDRESS | \ + UFFD_FEATURE_WP_HUGETLBFS_SHMEM) #define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ @@ -47,7 +48,8 @@ #define UFFD_API_RANGE_IOCTLS_BASIC \ ((__u64)1 << _UFFDIO_WAKE | \ (__u64)1 << _UFFDIO_COPY | \ - (__u64)1 << _UFFDIO_CONTINUE) + (__u64)1 << _UFFDIO_CONTINUE | \ + (__u64)1 << _UFFDIO_WRITEPROTECT) /* * Valid ioctl command number range with this API is from 0x00 to @@ -194,6 +196,9 @@ struct uffdio_api { * UFFD_FEATURE_EXACT_ADDRESS indicates that the exact address of page * faults would be provided and the offset within the page would not be * masked. + * + * UFFD_FEATURE_WP_HUGETLBFS_SHMEM indicates that userfaultfd + * write-protection mode is supported on both shmem and hugetlbfs. */ #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) #define UFFD_FEATURE_EVENT_FORK (1<<1) @@ -207,6 +212,7 @@ struct uffdio_api { #define UFFD_FEATURE_MINOR_HUGETLBFS (1<<9) #define UFFD_FEATURE_MINOR_SHMEM (1<<10) #define UFFD_FEATURE_EXACT_ADDRESS (1<<11) +#define UFFD_FEATURE_WP_HUGETLBFS_SHMEM (1<<12) __u64 features; __u64 ioctls; diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 01edc18902c5..4f4892a5f767 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -732,15 +732,12 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, err = -ENOENT; dst_vma = find_dst_vma(dst_mm, start, len); - /* - * Make sure the vma is not shared, that the dst range is - * both valid and fully within a single existing vma. - */ - if (!dst_vma || (dst_vma->vm_flags & VM_SHARED)) + + if (!dst_vma) goto out_unlock; if (!userfaultfd_wp(dst_vma)) goto out_unlock; - if (!vma_is_anonymous(dst_vma)) + if (!vma_can_userfault(dst_vma, dst_vma->vm_flags)) goto out_unlock; if (is_vm_hugetlb_page(dst_vma)) { -- cgit v1.3-8-gc7d7