From bc0c3357601e3ff1b006600530079bd246ef0d82 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 23 Aug 2023 19:05:56 +0200 Subject: mm: remove remnants of SPLIT_RSS_COUNTING The feature got retired in f1a7941243c1 ("mm: convert mm's rss stats into percpu_counter"), but the patch failed to fully clean it up. Link: https://lkml.kernel.org/r/20230823170556.2281747-1-mjguzik@gmail.com Signed-off-by: Mateusz Guzik Acked-by: Shakeel Butt Signed-off-by: Andrew Morton --- kernel/fork.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 3b6d20dfb9a8..1779183a7cb3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2406,10 +2406,6 @@ __latent_entropy struct task_struct *copy_process( p->io_uring = NULL; #endif -#if defined(SPLIT_RSS_COUNTING) - memset(&p->rss_stat, 0, sizeof(p->rss_stat)); -#endif - p->default_timer_slack_ns = current->timer_slack_ns; #ifdef CONFIG_PSI -- cgit v1.2.3-59-g8ed1b From 24e41bf8a6b424c76c5902fb999e9eca61bdf83d Mon Sep 17 00:00:00 2001 From: Florent Revest Date: Mon, 28 Aug 2023 17:08:57 +0200 Subject: mm: add a NO_INHERIT flag to the PR_SET_MDWE prctl This extends the current PR_SET_MDWE prctl arg with a bit to indicate that the process doesn't want MDWE protection to propagate to children. To implement this no-inherit mode, the tag in current->mm->flags must be absent from MMF_INIT_MASK. This means that the encoding for "MDWE but without inherit" is different in the prctl than in the mm flags. This leads to a bit of bit-mangling in the prctl implementation. Link: https://lkml.kernel.org/r/20230828150858.393570-6-revest@chromium.org Signed-off-by: Florent Revest Reviewed-by: Kees Cook Reviewed-by: Catalin Marinas Cc: Alexey Izbyshev Cc: Anshuman Khandual Cc: Ayush Jain Cc: David Hildenbrand Cc: Greg Thelen Cc: Joey Gouly Cc: KP Singh Cc: Mark Brown Cc: Michal Hocko Cc: Peter Xu Cc: Ryan Roberts Cc: Szabolcs Nagy Cc: Topi Miettinen Signed-off-by: Andrew Morton --- include/linux/sched/coredump.h | 10 ++++++++++ include/uapi/linux/prctl.h | 1 + kernel/fork.c | 2 +- kernel/sys.c | 32 ++++++++++++++++++++++++++------ tools/include/uapi/linux/prctl.h | 1 + 5 files changed, 39 insertions(+), 7 deletions(-) (limited to 'kernel/fork.c') diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h index 0ee96ea7a0e9..1b37fa8fc723 100644 --- a/include/linux/sched/coredump.h +++ b/include/linux/sched/coredump.h @@ -91,4 +91,14 @@ static inline int get_dumpable(struct mm_struct *mm) MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK) #define MMF_VM_MERGE_ANY 29 +#define MMF_HAS_MDWE_NO_INHERIT 30 + +static inline unsigned long mmf_init_flags(unsigned long flags) +{ + if (flags & (1UL << MMF_HAS_MDWE_NO_INHERIT)) + flags &= ~((1UL << MMF_HAS_MDWE) | + (1UL << MMF_HAS_MDWE_NO_INHERIT)); + return flags & MMF_INIT_MASK; +} + #endif /* _LINUX_SCHED_COREDUMP_H */ diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 9a85c69782bd..370ed14b1ae0 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -284,6 +284,7 @@ struct prctl_mm_map { /* Memory deny write / execute */ #define PR_SET_MDWE 65 # define PR_MDWE_REFUSE_EXEC_GAIN (1UL << 0) +# define PR_MDWE_NO_INHERIT (1UL << 1) #define PR_GET_MDWE 66 diff --git a/kernel/fork.c b/kernel/fork.c index 1779183a7cb3..e45a4457ba83 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1288,7 +1288,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, hugetlb_count_init(mm); if (current->mm) { - mm->flags = 
current->mm->flags & MMF_INIT_MASK; + mm->flags = mmf_init_flags(current->mm->flags); mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK; } else { mm->flags = default_dump_filter; diff --git a/kernel/sys.c b/kernel/sys.c index 2410e3999ebe..4a8073c1b255 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2368,19 +2368,41 @@ static int prctl_set_vma(unsigned long opt, unsigned long start, } #endif /* CONFIG_ANON_VMA_NAME */ +static inline unsigned long get_current_mdwe(void) +{ + unsigned long ret = 0; + + if (test_bit(MMF_HAS_MDWE, &current->mm->flags)) + ret |= PR_MDWE_REFUSE_EXEC_GAIN; + if (test_bit(MMF_HAS_MDWE_NO_INHERIT, &current->mm->flags)) + ret |= PR_MDWE_NO_INHERIT; + + return ret; +} + static inline int prctl_set_mdwe(unsigned long bits, unsigned long arg3, unsigned long arg4, unsigned long arg5) { + unsigned long current_bits; + if (arg3 || arg4 || arg5) return -EINVAL; - if (bits & ~(PR_MDWE_REFUSE_EXEC_GAIN)) + if (bits & ~(PR_MDWE_REFUSE_EXEC_GAIN | PR_MDWE_NO_INHERIT)) + return -EINVAL; + + /* NO_INHERIT only makes sense with REFUSE_EXEC_GAIN */ + if (bits & PR_MDWE_NO_INHERIT && !(bits & PR_MDWE_REFUSE_EXEC_GAIN)) return -EINVAL; + current_bits = get_current_mdwe(); + if (current_bits && current_bits != bits) + return -EPERM; /* Cannot unset the flags */ + + if (bits & PR_MDWE_NO_INHERIT) + set_bit(MMF_HAS_MDWE_NO_INHERIT, &current->mm->flags); if (bits & PR_MDWE_REFUSE_EXEC_GAIN) set_bit(MMF_HAS_MDWE, &current->mm->flags); - else if (test_bit(MMF_HAS_MDWE, &current->mm->flags)) - return -EPERM; /* Cannot unset the flag */ return 0; } @@ -2390,9 +2412,7 @@ static inline int prctl_get_mdwe(unsigned long arg2, unsigned long arg3, { if (arg2 || arg3 || arg4 || arg5) return -EINVAL; - - return test_bit(MMF_HAS_MDWE, &current->mm->flags) ? - PR_MDWE_REFUSE_EXEC_GAIN : 0; + return get_current_mdwe(); } static int prctl_get_auxv(void __user *addr, unsigned long len) diff --git a/tools/include/uapi/linux/prctl.h b/tools/include/uapi/linux/prctl.h index 9a85c69782bd..370ed14b1ae0 100644 --- a/tools/include/uapi/linux/prctl.h +++ b/tools/include/uapi/linux/prctl.h @@ -284,6 +284,7 @@ struct prctl_mm_map { /* Memory deny write / execute */ #define PR_SET_MDWE 65 # define PR_MDWE_REFUSE_EXEC_GAIN (1UL << 0) +# define PR_MDWE_NO_INHERIT (1UL << 1) #define PR_GET_MDWE 66 -- cgit v1.2.3-59-g8ed1b From e8e17ee90eaf650c855adb0a3e5e965fd6692ff1 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 12 Oct 2023 18:04:28 +0100 Subject: mm: drop the assumption that VM_SHARED always implies writable Patch series "permit write-sealed memfd read-only shared mappings", v4. The man page for fcntl() describing memfd file seals states the following about F_SEAL_WRITE:- Furthermore, trying to create new shared, writable memory-mappings via mmap(2) will also fail with EPERM. With emphasis on 'writable'. It turns out in fact that currently the kernel simply disallows all new shared memory mappings for a memfd with F_SEAL_WRITE applied, rendering this documentation inaccurate. This matters because users are therefore unable to obtain a shared mapping to a memfd after write sealing altogether, which limits their usefulness. This was reported in the discussion thread [1] originating from a bug report [2]. This is a product of both using the struct address_space->i_mmap_writable atomic counter to determine whether writing may be permitted, and the kernel adjusting this counter when any VM_SHARED mapping is performed and more generally implicitly assuming VM_SHARED implies writable.
It seems sensible that we should only update this mapping if VM_MAYWRITE is specified, i.e. whether it is possible that this mapping could at any point be written to. If we do so then all we need to do to permit write seals to function as documented is to clear VM_MAYWRITE when mapping read-only. It turns out this functionality already exists for F_SEAL_FUTURE_WRITE - we can therefore simply adapt this logic to do the same for F_SEAL_WRITE. We then hit a chicken and egg situation in mmap_region() where the check for VM_MAYWRITE occurs before we are able to clear this flag. To work around this, perform this check after we invoke call_mmap(), with careful consideration of error paths. Thanks to Andy Lutomirski for the suggestion! [1]:https://lore.kernel.org/all/20230324133646.16101dfa666f253c4715d965@linux-foundation.org/ [2]:https://bugzilla.kernel.org/show_bug.cgi?id=217238 This patch (of 3): There is a general assumption that VMAs with the VM_SHARED flag set are writable. If the VM_MAYWRITE flag is not set, then this is simply not the case. Update those checks which affect the struct address_space->i_mmap_writable field to explicitly test for this by introducing [vma_]is_shared_maywrite() helper functions. This remains entirely conservative, as the lack of VM_MAYWRITE guarantees that the VMA cannot be written to. Link: https://lkml.kernel.org/r/cover.1697116581.git.lstoakes@gmail.com Link: https://lkml.kernel.org/r/d978aefefa83ec42d18dfa964ad180dbcde34795.1697116581.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Suggested-by: Andy Lutomirski Reviewed-by: Jan Kara Cc: Alexander Viro Cc: Christian Brauner Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/fs.h | 4 ++-- include/linux/mm.h | 11 +++++++++++ kernel/fork.c | 2 +- mm/filemap.c | 2 +- mm/madvise.c | 2 +- mm/mmap.c | 12 ++++++------ 6 files changed, 22 insertions(+), 11 deletions(-) (limited to 'kernel/fork.c') diff --git a/include/linux/fs.h b/include/linux/fs.h index fd539c9fef8e..5265186da0e2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -454,7 +454,7 @@ extern const struct address_space_operations empty_aops; * It is also used to block modification of page cache contents through * memory mappings. * @gfp_mask: Memory allocation flags to use for allocating pages. - * @i_mmap_writable: Number of VM_SHARED mappings. + * @i_mmap_writable: Number of VM_SHARED, VM_MAYWRITE mappings. * @nr_thps: Number of THPs in the pagecache (non-shmem only). * @i_mmap: Tree of private and shared mappings. * @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable. @@ -557,7 +557,7 @@ static inline int mapping_mapped(struct address_space *mapping) /* * Might pages of this file have been modified in userspace? - * Note that i_mmap_writable counts all VM_SHARED vmas: do_mmap + * Note that i_mmap_writable counts all VM_SHARED, VM_MAYWRITE vmas: do_mmap * marks vma as VM_SHARED if it is shared, and the file was opened for * writing i.e. vma may be mprotected writable even if now readonly. 
* diff --git a/include/linux/mm.h b/include/linux/mm.h index 1bddb151cd5c..c3b5749ede9d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -937,6 +937,17 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma) return vma->vm_flags & VM_ACCESS_FLAGS; } +static inline bool is_shared_maywrite(vm_flags_t vm_flags) +{ + return (vm_flags & (VM_SHARED | VM_MAYWRITE)) == + (VM_SHARED | VM_MAYWRITE); +} + +static inline bool vma_is_shared_maywrite(struct vm_area_struct *vma) +{ + return is_shared_maywrite(vma->vm_flags); +} + static inline struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max) { diff --git a/kernel/fork.c b/kernel/fork.c index e45a4457ba83..1e6c656e0857 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -733,7 +733,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, get_file(file); i_mmap_lock_write(mapping); - if (tmp->vm_flags & VM_SHARED) + if (vma_is_shared_maywrite(tmp)) mapping_allow_writable(mapping); flush_dcache_mmap_lock(mapping); /* insert tmp into the share list, just after mpnt */ diff --git a/mm/filemap.c b/mm/filemap.c index 9ef49255f1a5..9710f43a89ac 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3618,7 +3618,7 @@ int generic_file_mmap(struct file *file, struct vm_area_struct *vma) */ int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma) { - if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) + if (vma_is_shared_maywrite(vma)) return -EINVAL; return generic_file_mmap(file, vma); } diff --git a/mm/madvise.c b/mm/madvise.c index a6b48d4796ba..cf4d694280e9 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -970,7 +970,7 @@ static long madvise_remove(struct vm_area_struct *vma, return -EINVAL; } - if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE)) + if (!vma_is_shared_maywrite(vma)) return -EACCES; offset = (loff_t)(start - vma->vm_start) diff --git a/mm/mmap.c b/mm/mmap.c index 3ea52451623b..0041e3631f6c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -107,7 +107,7 @@ void vma_set_page_prot(struct vm_area_struct *vma) static void __remove_shared_vm_struct(struct vm_area_struct *vma, struct file *file, struct address_space *mapping) { - if (vma->vm_flags & VM_SHARED) + if (vma_is_shared_maywrite(vma)) mapping_unmap_writable(mapping); flush_dcache_mmap_lock(mapping); @@ -384,7 +384,7 @@ static unsigned long count_vma_pages_range(struct mm_struct *mm, static void __vma_link_file(struct vm_area_struct *vma, struct address_space *mapping) { - if (vma->vm_flags & VM_SHARED) + if (vma_is_shared_maywrite(vma)) mapping_allow_writable(mapping); flush_dcache_mmap_lock(mapping); @@ -2846,7 +2846,7 @@ cannot_expand: vma->vm_pgoff = pgoff; if (file) { - if (vm_flags & VM_SHARED) { + if (is_shared_maywrite(vm_flags)) { error = mapping_map_writable(file->f_mapping); if (error) goto free_vma; @@ -2920,7 +2920,7 @@ cannot_expand: mm->map_count++; if (vma->vm_file) { i_mmap_lock_write(vma->vm_file->f_mapping); - if (vma->vm_flags & VM_SHARED) + if (vma_is_shared_maywrite(vma)) mapping_allow_writable(vma->vm_file->f_mapping); flush_dcache_mmap_lock(vma->vm_file->f_mapping); @@ -2937,7 +2937,7 @@ cannot_expand: /* Once vma denies write, undo our temporary denial count */ unmap_writable: - if (file && vm_flags & VM_SHARED) + if (file && is_shared_maywrite(vm_flags)) mapping_unmap_writable(file->f_mapping); file = vma->vm_file; ksm_add_vma(vma); @@ -2985,7 +2985,7 @@ unmap_and_free_vma: unmap_region(mm, &vmi.mas, vma, prev, next, vma->vm_start, vma->vm_end, vma->vm_end, true); } - if (file 
&& (vm_flags & VM_SHARED)) + if (file && is_shared_maywrite(vm_flags)) mapping_unmap_writable(file->f_mapping); free_vma: vm_area_free(vma); -- cgit v1.2.3-59-g8ed1b
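
Usage sketch for the PR_SET_MDWE change in commit 24e41bf8a6b4 above. This is illustrative only and not part of the patch; it assumes uapi headers new enough to carry PR_MDWE_NO_INHERIT (fallback defines below mirror the include/uapi/linux/prctl.h hunk) and a kernel with this series applied.

/*
 * Illustrative only: enable MDWE for the current process without letting
 * it propagate to children. Values mirror include/uapi/linux/prctl.h.
 */
#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_SET_MDWE
#define PR_SET_MDWE			65
#define PR_GET_MDWE			66
#endif
#ifndef PR_MDWE_REFUSE_EXEC_GAIN
#define PR_MDWE_REFUSE_EXEC_GAIN	(1UL << 0)
#endif
#ifndef PR_MDWE_NO_INHERIT
#define PR_MDWE_NO_INHERIT		(1UL << 1)
#endif

int main(void)
{
	/*
	 * NO_INHERIT is only accepted together with REFUSE_EXEC_GAIN;
	 * prctl_set_mdwe() returns -EINVAL otherwise.
	 */
	if (prctl(PR_SET_MDWE,
		  PR_MDWE_REFUSE_EXEC_GAIN | PR_MDWE_NO_INHERIT, 0L, 0L, 0L)) {
		perror("PR_SET_MDWE");
		return 1;
	}

	/* PR_GET_MDWE now reports both bits for this process. */
	printf("MDWE bits: %d\n", prctl(PR_GET_MDWE, 0L, 0L, 0L, 0L));

	/*
	 * Any mm created from here on passes through mmf_init_flags(),
	 * which strips both MDWE flags, so the protection stays local
	 * to this process rather than propagating to children.
	 */
	return 0;
}

Attempting to clear the bits afterwards, e.g. prctl(PR_SET_MDWE, 0, 0, 0, 0), still fails with -EPERM, matching the "Cannot unset the flags" check in prctl_set_mdwe().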
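
For the memfd series started by the last patch, a minimal userspace sketch of the behaviour the cover letter describes (again illustrative, not part of the patch; only patch 1 of 3 is shown above, and the mmap() below needs the remainder of the series to succeed).

#define _GNU_SOURCE
/*
 * Illustrative only: map a write-sealed memfd MAP_SHARED but read-only.
 * Per the cover letter, before this series the mmap() below fails with
 * EPERM; with the full series applied it succeeds because mmap() clears
 * VM_MAYWRITE for the sealed file, so i_mmap_writable is never bumped.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = memfd_create("sealed", MFD_ALLOW_SEALING);

	if (fd < 0) {
		perror("memfd_create");
		return 1;
	}

	/* Size and populate the memfd before sealing writes. */
	if (ftruncate(fd, 4096) || write(fd, "hello", 5) != 5) {
		perror("populate");
		return 1;
	}

	/* No new writable mappings may be created after this. */
	if (fcntl(fd, F_ADD_SEALS, F_SEAL_WRITE)) {
		perror("F_ADD_SEALS");
		return 1;
	}

	/* Shared, but PROT_READ only - exactly what the man page permits. */
	char *p = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	printf("read-only shared mapping: %.5s\n", p);

	munmap(p, 4096);
	close(fd);
	return 0;
}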