author     2025-08-05 16:02:07 +0300
committer  2025-08-05 16:02:07 +0300
commit     da23ea194db94257123f1534d487f3cdc9b5626d (patch)
tree       a6c069c9f5b55ebd81771e4f7871e174b1e034c7
parent     Merge tag 'i2c-for-6.17-rc1-part2' of git://git.kernel.org/pub/scm/linux/kernel/git/wsa/linux (diff)
parent     mm: mempool: fix crash in mempool_free() for zero-minimum pools (diff)
Merge tag 'mm-stable-2025-08-03-12-35' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull more MM updates from Andrew Morton:
"Significant patch series in this pull request:
- "mseal cleanups" (Lorenzo Stoakes)
Some mseal cleanups with no intended functional change.
- "Optimizations for khugepaged" (David Hildenbrand)
Improve khugepaged throughput by batching PTE operations for large
folios. This gain is mainly for arm64.
- "x86: enable EXECMEM_ROX_CACHE for ftrace and kprobes" (Mike Rapoport)
A bugfix, additional debug code and cleanups to the execmem code.
- "mm/shmem, swap: bugfix and improvement of mTHP swap in" (Kairui Song)
Bugfixes, cleanups and performance improvements to the mTHP swapin
code"
* tag 'mm-stable-2025-08-03-12-35' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (38 commits)
mm: mempool: fix crash in mempool_free() for zero-minimum pools
mm: correct type for vmalloc vm_flags fields
mm/shmem, swap: fix major fault counting
mm/shmem, swap: rework swap entry and index calculation for large swapin
mm/shmem, swap: simplify swapin path and result handling
mm/shmem, swap: never use swap cache and readahead for SWP_SYNCHRONOUS_IO
mm/shmem, swap: tidy up swap entry splitting
mm/shmem, swap: tidy up THP swapin checks
mm/shmem, swap: avoid redundant Xarray lookup during swapin
x86/ftrace: enable EXECMEM_ROX_CACHE for ftrace allocations
x86/kprobes: enable EXECMEM_ROX_CACHE for kprobes allocations
execmem: drop writable parameter from execmem_fill_trapping_insns()
execmem: add fallback for failures in vmalloc(VM_ALLOW_HUGE_VMAP)
execmem: move execmem_force_rw() and execmem_restore_rox() before use
execmem: rework execmem_cache_free()
execmem: introduce execmem_alloc_rw()
execmem: drop unused execmem_update_copy()
mm: fix a UAF when vma->mm is freed after vma->vm_refcnt got dropped
mm/rmap: add anon_vma lifetime debug check
mm: remove mm/io-mapping.c
...
42 files changed, 1080 insertions, 511 deletions
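
Before the raw diff: the execmem rework below drops execmem_make_temp_rw() and execmem_update_copy() in favour of execmem_alloc_rw(), and callers (modules, ftrace, kprobes) are now responsible for restoring ROX permissions themselves. A hedged consumer-side sketch follows; install_text() and its insns buffer are invented for the example, and the immediate execmem_restore_rox() call compresses what the module loader actually does in a later step:

#include <linux/execmem.h>
#include <linux/string.h>

/*
 * Illustrative only: allocate executable memory writable, fill it,
 * then seal it back to read-only-execute when the range is ROX-cached.
 */
static void *install_text(const void *insns, size_t size)
{
	void *p = execmem_alloc_rw(EXECMEM_MODULE_TEXT, size);

	if (!p)
		return NULL;

	memcpy(p, insns, size);	/* plain write: the mapping is RW here */

	/* the caller owns permissions now; restore ROX where it applies */
	if (execmem_is_rox(EXECMEM_MODULE_TEXT) &&
	    execmem_restore_rox(p, size)) {
		execmem_free(p);
		return NULL;
	}

	return p;
}

On architectures that do not use the ROX cache, execmem_is_rox() returns false and the default permissions for the range are already writable, so the same call sequence still works; the new comment in mm/execmem.c below relies on exactly that.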
diff --git a/Documentation/core-api/mm-api.rst b/Documentation/core-api/mm-api.rst index 50cfc7842930..5063179cfc70 100644 --- a/Documentation/core-api/mm-api.rst +++ b/Documentation/core-api/mm-api.rst @@ -133,4 +133,3 @@ More Memory Management Functions .. kernel-doc:: mm/mmu_notifier.c .. kernel-doc:: mm/balloon_compaction.c .. kernel-doc:: mm/huge_memory.c -.. kernel-doc:: mm/io-mapping.c diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index abd9725796e9..34e5d78af076 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -721,7 +721,7 @@ void mark_rodata_ro(void) static void __init declare_vma(struct vm_struct *vma, void *va_start, void *va_end, - vm_flags_t vm_flags) + unsigned long vm_flags) { phys_addr_t pa_start = __pa_symbol(va_start); unsigned long size = va_end - va_start; @@ -1528,7 +1528,7 @@ early_initcall(prevent_bootmem_remove_init); pte_t modify_prot_start_ptes(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, unsigned int nr) { - pte_t pte = get_and_clear_full_ptes(vma->vm_mm, addr, ptep, nr, /* full = */ 0); + pte_t pte = get_and_clear_ptes(vma->vm_mm, addr, ptep, nr); if (alternative_has_cap_unlikely(ARM64_WORKAROUND_2645198)) { /* diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 9f6b7dab2d9a..7bde68247b5f 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -120,7 +120,7 @@ struct its_array its_pages; static void *__its_alloc(struct its_array *pages) { - void *page __free(execmem) = execmem_alloc(EXECMEM_MODULE_TEXT, PAGE_SIZE); + void *page __free(execmem) = execmem_alloc_rw(EXECMEM_MODULE_TEXT, PAGE_SIZE); if (!page) return NULL; @@ -237,7 +237,6 @@ static void *its_alloc(void) if (!page) return NULL; - execmem_make_temp_rw(page, PAGE_SIZE); if (pages == &its_pages) set_memory_x((unsigned long)page, 1); diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 252e82bcfd2f..4450acec9390 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -263,7 +263,7 @@ void arch_ftrace_update_code(int command) static inline void *alloc_tramp(unsigned long size) { - return execmem_alloc(EXECMEM_FTRACE, size); + return execmem_alloc_rw(EXECMEM_FTRACE, size); } static inline void tramp_free(void *tramp) { diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 47cb8eb138ba..6079d15dab8c 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -481,24 +481,6 @@ static int prepare_singlestep(kprobe_opcode_t *buf, struct kprobe *p, return len; } -/* Make page to RO mode when allocate it */ -void *alloc_insn_page(void) -{ - void *page; - - page = execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE); - if (!page) - return NULL; - - /* - * TODO: Once additional kernel code protection mechanisms are set, ensure - * that the page was not maliciously altered and it is still zeroed. 
- */ - set_memory_rox((unsigned long)page, 1); - - return page; -} - /* Kprobe x86 instruction emulation - only regs->ip or IF flag modifiers */ static void kprobe_emulate_ifmodifiers(struct kprobe *p, struct pt_regs *regs) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 7456df985d96..bb57e93b4caf 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -1063,13 +1063,9 @@ unsigned long arch_max_swapfile_size(void) static struct execmem_info execmem_info __ro_after_init; #ifdef CONFIG_ARCH_HAS_EXECMEM_ROX -void execmem_fill_trapping_insns(void *ptr, size_t size, bool writeable) +void execmem_fill_trapping_insns(void *ptr, size_t size) { - /* fill memory with INT3 instructions */ - if (writeable) - memset(ptr, INT3_INSN_OPCODE, size); - else - text_poke_set(ptr, INT3_INSN_OPCODE, size); + memset(ptr, INT3_INSN_OPCODE, size); } #endif @@ -1102,7 +1098,21 @@ struct execmem_info __init *execmem_arch_setup(void) .pgprot = pgprot, .alignment = MODULE_ALIGN, }, - [EXECMEM_KPROBES ... EXECMEM_BPF] = { + [EXECMEM_KPROBES] = { + .flags = flags, + .start = start, + .end = MODULES_END, + .pgprot = PAGE_KERNEL_ROX, + .alignment = MODULE_ALIGN, + }, + [EXECMEM_FTRACE] = { + .flags = flags, + .start = start, + .end = MODULES_END, + .pgprot = pgprot, + .alignment = MODULE_ALIGN, + }, + [EXECMEM_BPF] = { .flags = EXECMEM_KASAN_SHADOW, .start = start, .end = MODULES_END, diff --git a/include/linux/execmem.h b/include/linux/execmem.h index 3be35680a54f..7de229134e30 100644 --- a/include/linux/execmem.h +++ b/include/linux/execmem.h @@ -60,27 +60,11 @@ enum execmem_range_flags { * will trap * @ptr: pointer to memory to fill * @size: size of the range to fill - * @writable: is the memory poited by @ptr is writable or ROX * * A hook for architecures to fill execmem ranges with invalid instructions. * Architectures that use EXECMEM_ROX_CACHE must implement this. */ -void execmem_fill_trapping_insns(void *ptr, size_t size, bool writable); - -/** - * execmem_make_temp_rw - temporarily remap region with read-write - * permissions - * @ptr: address of the region to remap - * @size: size of the region to remap - * - * Remaps a part of the cached large page in the ROX cache in the range - * [@ptr, @ptr + @size) as writable and not executable. The caller must - * have exclusive ownership of this range and ensure nothing will try to - * execute code in this range. - * - * Return: 0 on success or negative error code on failure. - */ -int execmem_make_temp_rw(void *ptr, size_t size); +void execmem_fill_trapping_insns(void *ptr, size_t size); /** * execmem_restore_rox - restore read-only-execute permissions @@ -95,7 +79,6 @@ int execmem_make_temp_rw(void *ptr, size_t size); */ int execmem_restore_rox(void *ptr, size_t size); #else -static inline int execmem_make_temp_rw(void *ptr, size_t size) { return 0; } static inline int execmem_restore_rox(void *ptr, size_t size) { return 0; } #endif @@ -166,6 +149,28 @@ struct execmem_info *execmem_arch_setup(void); void *execmem_alloc(enum execmem_type type, size_t size); /** + * execmem_alloc_rw - allocate writable executable memory + * @type: type of the allocation + * @size: how many bytes of memory are required + * + * Allocates memory that will contain executable code, either generated or + * loaded from kernel modules. + * + * Allocates memory that will contain data coupled with executable code, + * like data sections in kernel modules. 
+ * + * Forces writable permissions on the allocated memory and the caller is + * responsible to manage the permissions afterwards. + * + * For architectures that use ROX cache the permissions will be set to R+W. + * For architectures that don't use ROX cache the default permissions for @type + * will be used as they must be writable. + * + * Return: a pointer to the allocated memory or %NULL + */ +void *execmem_alloc_rw(enum execmem_type type, size_t size); + +/** * execmem_free - free executable memory * @ptr: pointer to the memory that should be freed */ @@ -186,19 +191,6 @@ struct vm_struct *execmem_vmap(size_t size); #endif /** - * execmem_update_copy - copy an update to executable memory - * @dst: destination address to update - * @src: source address containing the data - * @size: how many bytes of memory shold be copied - * - * Copy @size bytes from @src to @dst using text poking if the memory at - * @dst is read-only. - * - * Return: a pointer to @dst or NULL on error - */ -void *execmem_update_copy(void *dst, const void *src, size_t size); - -/** * execmem_is_rox - check if execmem is read-only * @type - the execmem type to check * diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h index 7376c1df9c90..c16353cc6e3c 100644 --- a/include/linux/io-mapping.h +++ b/include/linux/io-mapping.h @@ -225,7 +225,4 @@ io_mapping_free(struct io_mapping *iomap) kfree(iomap); } -int io_mapping_map_user(struct io_mapping *iomap, struct vm_area_struct *vma, - unsigned long addr, unsigned long pfn, unsigned long size); - #endif /* _LINUX_IO_MAPPING_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 349f0d9aad22..1ae97a0b8ec7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -414,8 +414,10 @@ extern unsigned int kobjsize(const void *objp); #endif #ifdef CONFIG_64BIT -/* VM is sealed, in vm_flags */ -#define VM_SEALED _BITUL(63) +#define VM_SEALED_BIT 42 +#define VM_SEALED BIT(VM_SEALED_BIT) +#else +#define VM_SEALED VM_NONE #endif /* Bits set in the VMA until the stack is in its final location */ diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index 1f4f44951abe..11a078de9150 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -12,6 +12,7 @@ extern int rcuwait_wake_up(struct rcuwait *w); #include <linux/tracepoint-defs.h> #include <linux/types.h> #include <linux/cleanup.h> +#include <linux/sched/mm.h> #define MMAP_LOCK_INITIALIZER(name) \ .mmap_lock = __RWSEM_INITIALIZER((name).mmap_lock), @@ -154,6 +155,10 @@ static inline void vma_refcount_put(struct vm_area_struct *vma) * reused and attached to a different mm before we lock it. * Returns the vma on success, NULL on failure to lock and EAGAIN if vma got * detached. + * + * WARNING! The vma passed to this function cannot be used if the function + * fails to lock it because in certain cases RCU lock is dropped and then + * reacquired. Once RCU lock is dropped the vma can be concurently freed. */ static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm, struct vm_area_struct *vma) @@ -183,6 +188,31 @@ static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm, } rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_); + + /* + * If vma got attached to another mm from under us, that mm is not + * stable and can be freed in the narrow window after vma->vm_refcnt + * is dropped and before rcuwait_wake_up(mm) is called. Grab it before + * releasing vma->vm_refcnt. 
+ */ + if (unlikely(vma->vm_mm != mm)) { + /* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt */ + struct mm_struct *other_mm = vma->vm_mm; + + /* + * __mmdrop() is a heavy operation and we don't need RCU + * protection here. Release RCU lock during these operations. + * We reinstate the RCU read lock as the caller expects it to + * be held when this function returns even on error. + */ + rcu_read_unlock(); + mmgrab(other_mm); + vma_refcount_put(vma); + mmdrop(other_mm); + rcu_read_lock(); + return NULL; + } + /* * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result. * False unlocked result is impossible because we modify and check diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 8e4d6eda8a8d..8d3fa3a91ce4 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -837,8 +837,6 @@ void set_page_writeback(struct page *page); #define folio_start_writeback(folio) \ __folio_start_writeback(folio, false) -#define folio_start_writeback_keepwrite(folio) \ - __folio_start_writeback(folio, true) static __always_inline bool folio_test_head(const struct folio *folio) { diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index e3b99920be05..4c035637eeb7 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -736,6 +736,29 @@ static inline pte_t get_and_clear_full_ptes(struct mm_struct *mm, } #endif +/** + * get_and_clear_ptes - Clear present PTEs that map consecutive pages of + * the same folio, collecting dirty/accessed bits. + * @mm: Address space the pages are mapped into. + * @addr: Address the first page is mapped at. + * @ptep: Page table pointer for the first entry. + * @nr: Number of entries to clear. + * + * Use this instead of get_and_clear_full_ptes() if it is known that we don't + * need to clear the full mm, which is mostly the case. + * + * Note that PTE bits in the PTE range besides the PFN can differ. For example, + * some PTEs might be write-protected. + * + * Context: The caller holds the page table lock. The PTEs map consecutive + * pages that belong to the same folio. The PTEs are all in the same PMD. + */ +static inline pte_t get_and_clear_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr) +{ + return get_and_clear_full_ptes(mm, addr, ptep, nr, 0); +} + #ifndef clear_full_ptes /** * clear_full_ptes - Clear present PTEs that map consecutive pages of the same @@ -768,6 +791,28 @@ static inline void clear_full_ptes(struct mm_struct *mm, unsigned long addr, } #endif +/** + * clear_ptes - Clear present PTEs that map consecutive pages of the same folio. + * @mm: Address space the pages are mapped into. + * @addr: Address the first page is mapped at. + * @ptep: Page table pointer for the first entry. + * @nr: Number of entries to clear. + * + * Use this instead of clear_full_ptes() if it is known that we don't need to + * clear the full mm, which is mostly the case. + * + * Note that PTE bits in the PTE range besides the PFN can differ. For example, + * some PTEs might be write-protected. + * + * Context: The caller holds the page table lock. The PTEs map consecutive + * pages that belong to the same folio. The PTEs are all in the same PMD. + */ +static inline void clear_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr) +{ + clear_full_ptes(mm, addr, ptep, nr, 0); +} + /* * If two threads concurrently fault at the same page, the thread that * won the race updates the PTE and its local TLB/Cache. 
The other thread diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 20803fcb49a7..6cd020eea37a 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -449,6 +449,28 @@ static inline void __folio_rmap_sanity_checks(const struct folio *folio, default: VM_WARN_ON_ONCE(true); } + + /* + * Anon folios must have an associated live anon_vma as long as they're + * mapped into userspace. + * Note that the atomic_read() mainly does two things: + * + * 1. In KASAN builds with CONFIG_SLUB_RCU_DEBUG, it causes KASAN to + * check that the associated anon_vma has not yet been freed (subject + * to KASAN's usual limitations). This check will pass if the + * anon_vma's refcount has already dropped to 0 but an RCU grace + * period hasn't passed since then. + * 2. If the anon_vma has not yet been freed, it checks that the + * anon_vma still has a nonzero refcount (as opposed to being in the + * middle of an RCU delay for getting freed). + */ + if (folio_test_anon(folio) && !folio_test_ksm(folio)) { + unsigned long mapping = (unsigned long)folio->mapping; + struct anon_vma *anon_vma; + + anon_vma = (void *)(mapping - FOLIO_MAPPING_ANON); + VM_WARN_ON_FOLIO(atomic_read(&anon_vma->refcount) == 0, folio); + } } /* diff --git a/kernel/fork.c b/kernel/fork.c index 9ce93fd20f82..c4ada32598bd 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -585,9 +585,12 @@ static void check_mm(struct mm_struct *mm) for (i = 0; i < NR_MM_COUNTERS; i++) { long x = percpu_counter_sum(&mm->rss_stat[i]); - if (unlikely(x)) - pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n", - mm, resident_page_types[i], x); + if (unlikely(x)) { + pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld Comm:%s Pid:%d\n", + mm, resident_page_types[i], x, + current->comm, + task_pid_nr(current)); + } } if (mm_pgtables_bytes(mm)) diff --git a/kernel/module/main.c b/kernel/module/main.c index 7f8bb51aedd4..c66b26184936 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -1322,20 +1322,11 @@ static int module_memory_alloc(struct module *mod, enum mod_mem_type type) else execmem_type = EXECMEM_MODULE_TEXT; - ptr = execmem_alloc(execmem_type, size); + ptr = execmem_alloc_rw(execmem_type, size); if (!ptr) return -ENOMEM; - if (execmem_is_rox(execmem_type)) { - int err = execmem_make_temp_rw(ptr, size); - - if (err) { - execmem_free(ptr); - return -ENOMEM; - } - - mod->mem[type].is_rox = true; - } + mod->mem[type].is_rox = execmem_is_rox(execmem_type); /* * The pointer to these blocks of memory are stored on the module diff --git a/mm/Kconfig b/mm/Kconfig index d5d4eca947a6..e443fe8cd6cf 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1242,10 +1242,6 @@ config KMAP_LOCAL config KMAP_LOCAL_NON_LINEAR_PTE_ARRAY bool -# struct io_mapping based helper. 
Selected by drivers that need them -config IO_MAPPING - bool - config MEMFD_CREATE bool "Enable memfd_create() system call" if EXPERT diff --git a/mm/Makefile b/mm/Makefile index 1a7a11d4933d..ef54aa615d9d 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -141,7 +141,6 @@ obj-$(CONFIG_MEMFD_CREATE) += memfd.o obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o obj-$(CONFIG_PTDUMP) += ptdump.o obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o -obj-$(CONFIG_IO_MAPPING) += io-mapping.o obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 94af19c4dfed..87e825349bdf 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -711,6 +711,10 @@ static void damos_va_migrate_dests_add(struct folio *folio, target -= dests->weight_arr[i]; } + /* If the folio is already in the right node, don't do anything */ + if (folio_nid(folio) == dests->node_id_arr[i]) + return; + isolate: if (!folio_isolate_lru(folio)) return; diff --git a/mm/execmem.c b/mm/execmem.c index 627e6cf64f4f..0822305413ec 100644 --- a/mm/execmem.c +++ b/mm/execmem.c @@ -26,7 +26,7 @@ static struct execmem_info default_execmem_info __ro_after_init; #ifdef CONFIG_MMU static void *execmem_vmalloc(struct execmem_range *range, size_t size, - pgprot_t pgprot, vm_flags_t vm_flags) + pgprot_t pgprot, unsigned long vm_flags) { bool kasan = range->flags & EXECMEM_KASAN_SHADOW; gfp_t gfp_flags = GFP_KERNEL | __GFP_NOWARN; @@ -82,7 +82,7 @@ struct vm_struct *execmem_vmap(size_t size) } #else static void *execmem_vmalloc(struct execmem_range *range, size_t size, - pgprot_t pgprot, vm_flags_t vm_flags) + pgprot_t pgprot, unsigned long vm_flags) { return vmalloc(size); } @@ -93,8 +93,15 @@ struct execmem_cache { struct mutex mutex; struct maple_tree busy_areas; struct maple_tree free_areas; + unsigned int pending_free_cnt; /* protected by mutex */ }; +/* delay to schedule asynchronous free if fast path free fails */ +#define FREE_DELAY (msecs_to_jiffies(10)) + +/* mark entries in busy_areas that should be freed asynchronously */ +#define PENDING_FREE_MASK (1 << (PAGE_SHIFT - 1)) + static struct execmem_cache execmem_cache = { .mutex = __MUTEX_INITIALIZER(execmem_cache.mutex), .busy_areas = MTREE_INIT_EXT(busy_areas, MT_FLAGS_LOCK_EXTERN, @@ -130,6 +137,27 @@ err_restore: return err; } +static int execmem_force_rw(void *ptr, size_t size) +{ + unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT; + unsigned long addr = (unsigned long)ptr; + int ret; + + ret = set_memory_nx(addr, nr); + if (ret) + return ret; + + return set_memory_rw(addr, nr); +} + +int execmem_restore_rox(void *ptr, size_t size) +{ + unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT; + unsigned long addr = (unsigned long)ptr; + + return set_memory_rox(addr, nr); +} + static void execmem_cache_clean(struct work_struct *work) { struct maple_tree *free_areas = &execmem_cache.free_areas; @@ -155,20 +183,17 @@ static void execmem_cache_clean(struct work_struct *work) static DECLARE_WORK(execmem_cache_clean_work, execmem_cache_clean); -static int execmem_cache_add(void *ptr, size_t size) +static int execmem_cache_add_locked(void *ptr, size_t size, gfp_t gfp_mask) { struct maple_tree *free_areas = &execmem_cache.free_areas; - struct mutex *mutex = &execmem_cache.mutex; unsigned long addr = (unsigned long)ptr; MA_STATE(mas, free_areas, addr - 1, addr + 1); unsigned long lower, upper; void *area = NULL; - int err; lower = addr; upper = addr + size - 
1; - mutex_lock(mutex); area = mas_walk(&mas); if (area && mas.last == addr - 1) lower = mas.index; @@ -178,12 +203,14 @@ static int execmem_cache_add(void *ptr, size_t size) upper = mas.last; mas_set_range(&mas, lower, upper); - err = mas_store_gfp(&mas, (void *)lower, GFP_KERNEL); - mutex_unlock(mutex); - if (err) - return err; + return mas_store_gfp(&mas, (void *)lower, gfp_mask); +} - return 0; +static int execmem_cache_add(void *ptr, size_t size, gfp_t gfp_mask) +{ + guard(mutex)(&execmem_cache.mutex); + + return execmem_cache_add_locked(ptr, size, gfp_mask); } static bool within_range(struct execmem_range *range, struct ma_state *mas, @@ -256,7 +283,7 @@ out_unlock: static int execmem_cache_populate(struct execmem_range *range, size_t size) { - vm_flags_t vm_flags = VM_ALLOW_HUGE_VMAP; + unsigned long vm_flags = VM_ALLOW_HUGE_VMAP; struct vm_struct *vm; size_t alloc_size; int err = -ENOMEM; @@ -264,6 +291,11 @@ static int execmem_cache_populate(struct execmem_range *range, size_t size) alloc_size = round_up(size, PMD_SIZE); p = execmem_vmalloc(range, alloc_size, PAGE_KERNEL, vm_flags); + if (!p) { + alloc_size = size; + p = execmem_vmalloc(range, alloc_size, PAGE_KERNEL, vm_flags); + } + if (!p) return err; @@ -272,13 +304,13 @@ static int execmem_cache_populate(struct execmem_range *range, size_t size) goto err_free_mem; /* fill memory with instructions that will trap */ - execmem_fill_trapping_insns(p, alloc_size, /* writable = */ true); + execmem_fill_trapping_insns(p, alloc_size); err = set_memory_rox((unsigned long)p, vm->nr_pages); if (err) goto err_free_mem; - err = execmem_cache_add(p, alloc_size); + err = execmem_cache_add(p, alloc_size, GFP_KERNEL); if (err) goto err_reset_direct_map; @@ -307,57 +339,117 @@ static void *execmem_cache_alloc(struct execmem_range *range, size_t size) return __execmem_cache_alloc(range, size); } -static bool execmem_cache_free(void *ptr) +static inline bool is_pending_free(void *ptr) { - struct maple_tree *busy_areas = &execmem_cache.busy_areas; - struct mutex *mutex = &execmem_cache.mutex; - unsigned long addr = (unsigned long)ptr; - MA_STATE(mas, busy_areas, addr, addr); - size_t size; - void *area; + return ((unsigned long)ptr & PENDING_FREE_MASK); +} - mutex_lock(mutex); - area = mas_walk(&mas); - if (!area) { - mutex_unlock(mutex); - return false; - } - size = mas_range_len(&mas); +static inline void *pending_free_set(void *ptr) +{ + return (void *)((unsigned long)ptr | PENDING_FREE_MASK); +} - mas_store_gfp(&mas, NULL, GFP_KERNEL); - mutex_unlock(mutex); +static inline void *pending_free_clear(void *ptr) +{ + return (void *)((unsigned long)ptr & ~PENDING_FREE_MASK); +} - execmem_fill_trapping_insns(ptr, size, /* writable = */ false); +static int __execmem_cache_free(struct ma_state *mas, void *ptr, gfp_t gfp_mask) +{ + size_t size = mas_range_len(mas); + int err; - execmem_cache_add(ptr, size); + err = execmem_force_rw(ptr, size); + if (err) + return err; - schedule_work(&execmem_cache_clean_work); + execmem_fill_trapping_insns(ptr, size); + execmem_restore_rox(ptr, size); - return true; + err = execmem_cache_add_locked(ptr, size, gfp_mask); + if (err) + return err; + + mas_store_gfp(mas, NULL, gfp_mask); + return 0; } -int execmem_make_temp_rw(void *ptr, size_t size) +static void execmem_cache_free_slow(struct work_struct *work); +static DECLARE_DELAYED_WORK(execmem_cache_free_work, execmem_cache_free_slow); + +static void execmem_cache_free_slow(struct work_struct *work) { - unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT; - unsigned 
long addr = (unsigned long)ptr; - int ret; + struct maple_tree *busy_areas = &execmem_cache.busy_areas; + MA_STATE(mas, busy_areas, 0, ULONG_MAX); + void *area; - ret = set_memory_nx(addr, nr); - if (ret) - return ret; + guard(mutex)(&execmem_cache.mutex); - return set_memory_rw(addr, nr); + if (!execmem_cache.pending_free_cnt) + return; + + mas_for_each(&mas, area, ULONG_MAX) { + if (!is_pending_free(area)) + continue; + + area = pending_free_clear(area); + if (__execmem_cache_free(&mas, area, GFP_KERNEL)) + continue; + + execmem_cache.pending_free_cnt--; + } + + if (execmem_cache.pending_free_cnt) + schedule_delayed_work(&execmem_cache_free_work, FREE_DELAY); + else + schedule_work(&execmem_cache_clean_work); } -int execmem_restore_rox(void *ptr, size_t size) +static bool execmem_cache_free(void *ptr) { - unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT; + struct maple_tree *busy_areas = &execmem_cache.busy_areas; unsigned long addr = (unsigned long)ptr; + MA_STATE(mas, busy_areas, addr, addr); + void *area; + int err; - return set_memory_rox(addr, nr); + guard(mutex)(&execmem_cache.mutex); + + area = mas_walk(&mas); + if (!area) + return false; + + err = __execmem_cache_free(&mas, area, GFP_KERNEL | __GFP_NORETRY); + if (err) { + /* + * mas points to exact slot we've got the area from, nothing + * else can modify the tree because of the mutex, so there + * won't be any allocations in mas_store_gfp() and it will just + * change the pointer. + */ + area = pending_free_set(area); + mas_store_gfp(&mas, area, GFP_KERNEL); + execmem_cache.pending_free_cnt++; + schedule_delayed_work(&execmem_cache_free_work, FREE_DELAY); + return true; + } + + schedule_work(&execmem_cache_clean_work); + + return true; } #else /* CONFIG_ARCH_HAS_EXECMEM_ROX */ +/* + * when ROX cache is not used the permissions defined by architectures for + * execmem ranges that are updated before use (e.g. 
EXECMEM_MODULE_TEXT) must + * be writable anyway + */ +static inline int execmem_force_rw(void *ptr, size_t size) +{ + return 0; +} + static void *execmem_cache_alloc(struct execmem_range *range, size_t size) { return NULL; @@ -373,9 +465,9 @@ void *execmem_alloc(enum execmem_type type, size_t size) { struct execmem_range *range = &execmem_info->ranges[type]; bool use_cache = range->flags & EXECMEM_ROX_CACHE; - vm_flags_t vm_flags = VM_FLUSH_RESET_PERMS; + unsigned long vm_flags = VM_FLUSH_RESET_PERMS; pgprot_t pgprot = range->pgprot; - void *p; + void *p = NULL; size = PAGE_ALIGN(size); @@ -387,6 +479,21 @@ void *execmem_alloc(enum execmem_type type, size_t size) return kasan_reset_tag(p); } +void *execmem_alloc_rw(enum execmem_type type, size_t size) +{ + void *p __free(execmem) = execmem_alloc(type, size); + int err; + + if (!p) + return NULL; + + err = execmem_force_rw(p, size); + if (err) + return NULL; + + return no_free_ptr(p); +} + void execmem_free(void *ptr) { /* @@ -399,11 +506,6 @@ void execmem_free(void *ptr) vfree(ptr); } -void *execmem_update_copy(void *dst, const void *src, size_t size) -{ - return text_poke_copy(dst, src, size); -} - bool execmem_is_rox(enum execmem_type type) { return !!(execmem_info->ranges[type].flags & EXECMEM_ROX_CACHE); diff --git a/mm/internal.h b/mm/internal.h index 1da16d550a45..45b725c3dc03 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1391,7 +1391,7 @@ int migrate_device_coherent_folio(struct folio *folio); struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long align, unsigned long shift, - vm_flags_t vm_flags, unsigned long start, + unsigned long vm_flags, unsigned long start, unsigned long end, int node, gfp_t gfp_mask, const void *caller); diff --git a/mm/io-mapping.c b/mm/io-mapping.c deleted file mode 100644 index d3586e95c12c..000000000000 --- a/mm/io-mapping.c +++ /dev/null @@ -1,30 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only - -#include <linux/mm.h> -#include <linux/io-mapping.h> - -/** - * io_mapping_map_user - remap an I/O mapping to userspace - * @iomap: the source io_mapping - * @vma: user vma to map to - * @addr: target user address to start at - * @pfn: physical address of kernel memory - * @size: size of map area - * - * Note: this is only safe if the mm semaphore is held when called. - */ -int io_mapping_map_user(struct io_mapping *iomap, struct vm_area_struct *vma, - unsigned long addr, unsigned long pfn, unsigned long size) -{ - vm_flags_t expected_flags = VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; - - if (WARN_ON_ONCE((vma->vm_flags & expected_flags) != expected_flags)) - return -EINVAL; - - pgprot_t remap_prot = __pgprot((pgprot_val(iomap->prot) & _PAGE_CACHE_MASK) | - (pgprot_val(vma->vm_page_prot) & ~_PAGE_CACHE_MASK)); - - /* We rely on prevalidation of the io-mapping to skip pfnmap tracking. */ - return remap_pfn_range_notrack(vma, addr, pfn, size, remap_prot); -} -EXPORT_SYMBOL_GPL(io_mapping_map_user); diff --git a/mm/kasan/common.c b/mm/kasan/common.c index ed4873e18c75..9142964ab9c9 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -230,16 +230,12 @@ static bool check_slab_allocation(struct kmem_cache *cache, void *object, } static inline void poison_slab_object(struct kmem_cache *cache, void *object, - bool init, bool still_accessible) + bool init) { void *tagged_object = object; object = kasan_reset_tag(object); - /* RCU slabs could be legally used after free within the RCU period. 
*/ - if (unlikely(still_accessible)) - return; - kasan_poison(object, round_up(cache->object_size, KASAN_GRANULE_SIZE), KASAN_SLAB_FREE, init); @@ -261,7 +257,22 @@ bool __kasan_slab_free(struct kmem_cache *cache, void *object, bool init, if (!kasan_arch_is_ready() || is_kfence_address(object)) return false; - poison_slab_object(cache, object, init, still_accessible); + /* + * If this point is reached with an object that must still be + * accessible under RCU, we can't poison it; in that case, also skip the + * quarantine. This should mostly only happen when CONFIG_SLUB_RCU_DEBUG + * has been disabled manually. + * + * Putting the object on the quarantine wouldn't help catch UAFs (since + * we can't poison it here), and it would mask bugs caused by + * SLAB_TYPESAFE_BY_RCU users not being careful enough about object + * reuse; so overall, putting the object into the quarantine here would + * be counterproductive. + */ + if (still_accessible) + return false; + + poison_slab_object(cache, object, init); /* * If the object is put into quarantine, do not let slab put the object @@ -519,7 +530,7 @@ bool __kasan_mempool_poison_object(void *ptr, unsigned long ip) if (check_slab_allocation(slab->slab_cache, ptr, ip)) return false; - poison_slab_object(slab->slab_cache, ptr, false, false); + poison_slab_object(slab->slab_cache, ptr, false); return true; } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index a55fb1dcd224..374a6a5193a7 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -700,12 +700,15 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte, spinlock_t *ptl, struct list_head *compound_pagelist) { + unsigned long end = address + HPAGE_PMD_SIZE; struct folio *src, *tmp; - pte_t *_pte; pte_t pteval; + pte_t *_pte; + unsigned int nr_ptes; - for (_pte = pte; _pte < pte + HPAGE_PMD_NR; - _pte++, address += PAGE_SIZE) { + for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte += nr_ptes, + address += nr_ptes * PAGE_SIZE) { + nr_ptes = 1; pteval = ptep_get(_pte); if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); @@ -722,18 +725,26 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte, struct page *src_page = pte_page(pteval); src = page_folio(src_page); - if (!folio_test_large(src)) + + if (folio_test_large(src)) { + unsigned int max_nr_ptes = (end - address) >> PAGE_SHIFT; + + nr_ptes = folio_pte_batch(src, _pte, pteval, max_nr_ptes); + } else { release_pte_folio(src); + } + /* * ptl mostly unnecessary, but preempt has to * be disabled to update the per-cpu stats * inside folio_remove_rmap_pte(). 
*/ spin_lock(ptl); - ptep_clear(vma->vm_mm, address, _pte); - folio_remove_rmap_pte(src, src_page, vma); + clear_ptes(vma->vm_mm, address, _pte, nr_ptes); + folio_remove_rmap_ptes(src, src_page, nr_ptes, vma); spin_unlock(ptl); - free_folio_and_swap_cache(src); + free_swap_cache(src); + folio_put_refs(src, nr_ptes); } } @@ -1492,15 +1503,17 @@ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr, int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, bool install_pmd) { + int nr_mapped_ptes = 0, result = SCAN_FAIL; + unsigned int nr_batch_ptes; struct mmu_notifier_range range; bool notified = false; unsigned long haddr = addr & HPAGE_PMD_MASK; + unsigned long end = haddr + HPAGE_PMD_SIZE; struct vm_area_struct *vma = vma_lookup(mm, haddr); struct folio *folio; pte_t *start_pte, *pte; pmd_t *pmd, pgt_pmd; spinlock_t *pml = NULL, *ptl; - int nr_ptes = 0, result = SCAN_FAIL; int i; mmap_assert_locked(mm); @@ -1614,11 +1627,15 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, goto abort; /* step 2: clear page table and adjust rmap */ - for (i = 0, addr = haddr, pte = start_pte; - i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) { + for (i = 0, addr = haddr, pte = start_pte; i < HPAGE_PMD_NR; + i += nr_batch_ptes, addr += nr_batch_ptes * PAGE_SIZE, + pte += nr_batch_ptes) { + unsigned int max_nr_batch_ptes = (end - addr) >> PAGE_SHIFT; struct page *page; pte_t ptent = ptep_get(pte); + nr_batch_ptes = 1; + if (pte_none(ptent)) continue; /* @@ -1632,26 +1649,29 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, goto abort; } page = vm_normal_page(vma, addr, ptent); + if (folio_page(folio, i) != page) goto abort; + nr_batch_ptes = folio_pte_batch(folio, pte, ptent, max_nr_batch_ptes); + /* * Must clear entry, or a racing truncate may re-remove it. * TLB flush can be left until pmdp_collapse_flush() does it. * PTE dirty? Shmem page is already dirty; file is read-only. */ - ptep_clear(mm, addr, pte); - folio_remove_rmap_pte(folio, page, vma); - nr_ptes++; + clear_ptes(mm, addr, pte, nr_batch_ptes); + folio_remove_rmap_ptes(folio, page, nr_batch_ptes, vma); + nr_mapped_ptes += nr_batch_ptes; } if (!pml) spin_unlock(ptl); /* step 3: set proper refcount and mm_counters. */ - if (nr_ptes) { - folio_ref_sub(folio, nr_ptes); - add_mm_counter(mm, mm_counter_file(folio), -nr_ptes); + if (nr_mapped_ptes) { + folio_ref_sub(folio, nr_mapped_ptes); + add_mm_counter(mm, mm_counter_file(folio), -nr_mapped_ptes); } /* step 4: remove empty page table */ @@ -1684,10 +1704,10 @@ maybe_install_pmd: : SCAN_SUCCEED; goto drop_folio; abort: - if (nr_ptes) { + if (nr_mapped_ptes) { flush_tlb_mm(mm); - folio_ref_sub(folio, nr_ptes); - add_mm_counter(mm, mm_counter_file(folio), -nr_ptes); + folio_ref_sub(folio, nr_mapped_ptes); + add_mm_counter(mm, mm_counter_file(folio), -nr_mapped_ptes); } unlock: if (start_pte) diff --git a/mm/madvise.c b/mm/madvise.c index bb80fc5ea08f..35ed4ab0d7c5 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -19,6 +19,7 @@ #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/mm_inline.h> +#include <linux/mmu_context.h> #include <linux/string.h> #include <linux/uio.h> #include <linux/ksm.h> @@ -1256,6 +1257,74 @@ static long madvise_guard_remove(struct madvise_behavior *madv_behavior) &guard_remove_walk_ops, NULL); } +#ifdef CONFIG_64BIT +/* Does the madvise operation result in discarding of mapped data? 
*/ +static bool is_discard(int behavior) +{ + switch (behavior) { + case MADV_FREE: + case MADV_DONTNEED: + case MADV_DONTNEED_LOCKED: + case MADV_REMOVE: + case MADV_DONTFORK: + case MADV_WIPEONFORK: + case MADV_GUARD_INSTALL: + return true; + } + + return false; +} + +/* + * We are restricted from madvise()'ing mseal()'d VMAs only in very particular + * circumstances - discarding of data from read-only anonymous SEALED mappings. + * + * This is because users cannot trivally discard data from these VMAs, and may + * only do so via an appropriate madvise() call. + */ +static bool can_madvise_modify(struct madvise_behavior *madv_behavior) +{ + struct vm_area_struct *vma = madv_behavior->vma; + + /* If the VMA isn't sealed we're good. */ + if (!vma_is_sealed(vma)) + return true; + + /* For a sealed VMA, we only care about discard operations. */ + if (!is_discard(madv_behavior->behavior)) + return true; + + /* + * We explicitly permit all file-backed mappings, whether MAP_SHARED or + * MAP_PRIVATE. + * + * The latter causes some complications. Because now, one can mmap() + * read/write a MAP_PRIVATE mapping, write to it, then mprotect() + * read-only, mseal() and a discard will be permitted. + * + * However, in order to avoid issues with potential use of madvise(..., + * MADV_DONTNEED) of mseal()'d .text mappings we, for the time being, + * permit this. + */ + if (!vma_is_anonymous(vma)) + return true; + + /* If the user could write to the mapping anyway, then this is fine. */ + if ((vma->vm_flags & VM_WRITE) && + arch_vma_access_permitted(vma, /* write= */ true, + /* execute= */ false, /* foreign= */ false)) + return true; + + /* Otherwise, we are not permitted to perform this operation. */ + return false; +} +#else +static bool can_madvise_modify(struct madvise_behavior *madv_behavior) +{ + return true; +} +#endif + /* * Apply an madvise behavior to a region of a vma. 
madvise_update_vma * will handle splitting a vm area into separate areas, each area with its own @@ -1269,7 +1338,7 @@ static int madvise_vma_behavior(struct madvise_behavior *madv_behavior) struct madvise_behavior_range *range = &madv_behavior->range; int error; - if (unlikely(!can_modify_vma_madv(madv_behavior->vma, behavior))) + if (unlikely(!can_madvise_modify(madv_behavior))) return -EPERM; switch (behavior) { diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 3047b9ac667e..e2e685b971bb 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -837,11 +837,17 @@ static int hwpoison_hugetlb_range(pte_t *ptep, unsigned long hmask, struct mm_walk *walk) { struct hwpoison_walk *hwp = walk->private; - pte_t pte = huge_ptep_get(walk->mm, addr, ptep); struct hstate *h = hstate_vma(walk->vma); + spinlock_t *ptl; + pte_t pte; + int ret; - return check_hwpoisoned_entry(pte, addr, huge_page_shift(h), - hwp->pfn, &hwp->tk); + ptl = huge_pte_lock(h, walk->mm, ptep); + pte = huge_ptep_get(walk->mm, addr, ptep); + ret = check_hwpoisoned_entry(pte, addr, huge_page_shift(h), + hwp->pfn, &hwp->tk); + spin_unlock(ptl); + return ret; } #else #define hwpoison_hugetlb_range NULL diff --git a/mm/mempool.c b/mm/mempool.c index 204a216b6418..1c38e873e546 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -136,7 +136,7 @@ static void kasan_unpoison_element(mempool_t *pool, void *element) static __always_inline void add_element(mempool_t *pool, void *element) { - BUG_ON(pool->curr_nr >= pool->min_nr); + BUG_ON(pool->min_nr != 0 && pool->curr_nr >= pool->min_nr); poison_element(pool, element); if (kasan_poison_element(pool, element)) pool->elements[pool->curr_nr++] = element; @@ -202,16 +202,20 @@ int mempool_init_node(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, pool->alloc = alloc_fn; pool->free = free_fn; init_waitqueue_head(&pool->wait); - - pool->elements = kmalloc_array_node(min_nr, sizeof(void *), + /* + * max() used here to ensure storage for at least 1 element to support + * zero minimum pool + */ + pool->elements = kmalloc_array_node(max(1, min_nr), sizeof(void *), gfp_mask, node_id); if (!pool->elements) return -ENOMEM; /* - * First pre-allocate the guaranteed number of buffers. + * First pre-allocate the guaranteed number of buffers, + * also pre-allocate 1 element for zero minimum pool. */ - while (pool->curr_nr < pool->min_nr) { + while (pool->curr_nr < max(1, pool->min_nr)) { void *element; element = pool->alloc(gfp_mask, pool->pool_data); @@ -555,20 +559,12 @@ void mempool_free(void *element, mempool_t *pool) * wake-up path of previous test. This explicit check ensures the * allocation of element when both min_nr and curr_nr are 0, and * any active waiters are properly awakened. - * - * Inline the same logic as previous test, add_element() cannot be - * directly used here since it has BUG_ON to deny if min_nr equals - * curr_nr, so here picked rest of add_element() to use without - * BUG_ON check. 
*/ if (unlikely(pool->min_nr == 0 && READ_ONCE(pool->curr_nr) == 0)) { spin_lock_irqsave(&pool->lock, flags); if (likely(pool->curr_nr == 0)) { - /* Inline the logic of add_element() */ - poison_element(pool, element); - if (kasan_poison_element(pool, element)) - pool->elements[pool->curr_nr++] = element; + add_element(pool, element); spin_unlock_irqrestore(&pool->lock, flags); if (wq_has_sleeper(&pool->wait)) wake_up(&pool->wait); diff --git a/mm/mincore.c b/mm/mincore.c index 42d6c9c8da86..10dabefc3acc 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -29,7 +29,9 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr, #ifdef CONFIG_HUGETLB_PAGE unsigned char present; unsigned char *vec = walk->private; + spinlock_t *ptl; + ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte); /* * Hugepages under user process are always in RAM and never * swapped out, but theoretically it needs to be checked. @@ -38,6 +40,7 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr, for (; addr != end; vec++, addr += PAGE_SIZE) *vec = present; walk->private = vec; + spin_unlock(ptl); #else BUG(); #endif diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c index 729fb7d0dd59..b006cec8e6fe 100644 --- a/mm/mmap_lock.c +++ b/mm/mmap_lock.c @@ -164,8 +164,7 @@ retry: */ /* Check if the vma we locked is the right one. */ - if (unlikely(vma->vm_mm != mm || - address < vma->vm_start || address >= vma->vm_end)) + if (unlikely(address < vma->vm_start || address >= vma->vm_end)) goto inval_end_read; rcu_read_unlock(); @@ -236,11 +235,8 @@ retry: goto fallback; } - /* - * Verify the vma we locked belongs to the same address space and it's - * not behind of the last search position. - */ - if (unlikely(vma->vm_mm != mm || from_addr >= vma->vm_end)) + /* Verify the vma is not behind the last search position. */ + if (unlikely(from_addr >= vma->vm_end)) goto fallback_unlock; /* diff --git a/mm/mprotect.c b/mm/mprotect.c index 2ddd37b2f462..78bded7acf79 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -766,7 +766,7 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, unsigned long charged = 0; int error; - if (!can_modify_vma(vma)) + if (vma_is_sealed(vma)) return -EPERM; if (newflags == oldflags) { diff --git a/mm/mremap.c b/mm/mremap.c index e15cf2e444c7..677a4d744df9 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -280,7 +280,7 @@ static int move_ptes(struct pagetable_move_control *pmc, old_pte, max_nr_ptes); force_flush = true; } - pte = get_and_clear_full_ptes(mm, old_addr, old_ptep, nr_ptes, 0); + pte = get_and_clear_ptes(mm, old_addr, old_ptep, nr_ptes); pte = move_pte(pte, old_addr, new_addr); pte = move_soft_dirty_pte(pte); @@ -1651,7 +1651,7 @@ static int check_prep_vma(struct vma_remap_struct *vrm) return -EFAULT; /* If mseal()'d, mremap() is prohibited. */ - if (!can_modify_vma(vma)) + if (vma_is_sealed(vma)) return -EPERM; /* Align to hugetlb page size, if required. 
*/ diff --git a/mm/mseal.c b/mm/mseal.c index c27197ac04e8..e5b205562d2e 100644 --- a/mm/mseal.c +++ b/mm/mseal.c @@ -11,148 +11,74 @@ #include <linux/mman.h> #include <linux/mm.h> #include <linux/mm_inline.h> -#include <linux/mmu_context.h> #include <linux/syscalls.h> #include <linux/sched.h> #include "internal.h" -static inline void set_vma_sealed(struct vm_area_struct *vma) -{ - vm_flags_set(vma, VM_SEALED); -} - -static bool is_madv_discard(int behavior) -{ - switch (behavior) { - case MADV_FREE: - case MADV_DONTNEED: - case MADV_DONTNEED_LOCKED: - case MADV_REMOVE: - case MADV_DONTFORK: - case MADV_WIPEONFORK: - case MADV_GUARD_INSTALL: - return true; - } - - return false; -} - -static bool is_ro_anon(struct vm_area_struct *vma) -{ - /* check anonymous mapping. */ - if (vma->vm_file || vma->vm_flags & VM_SHARED) - return false; - - /* - * check for non-writable: - * PROT=RO or PKRU is not writeable. - */ - if (!(vma->vm_flags & VM_WRITE) || - !arch_vma_access_permitted(vma, true, false, false)) - return true; - - return false; -} - /* - * Check if a vma is allowed to be modified by madvise. + * mseal() disallows an input range which contain unmapped ranges (VMA holes). + * + * It disallows unmapped regions from start to end whether they exist at the + * start, in the middle, or at the end of the range, or any combination thereof. + * + * This is because after sealng a range, there's nothing to stop memory mapping + * of ranges in the remaining gaps later, meaning that the user might then + * wrongly consider the entirety of the mseal()'d range to be sealed when it + * in fact isn't. */ -bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior) -{ - if (!is_madv_discard(behavior)) - return true; - - if (unlikely(!can_modify_vma(vma) && is_ro_anon(vma))) - return false; - - /* Allow by default. */ - return true; -} - -static int mseal_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, - struct vm_area_struct **prev, unsigned long start, - unsigned long end, vm_flags_t newflags) -{ - int ret = 0; - vm_flags_t oldflags = vma->vm_flags; - - if (newflags == oldflags) - goto out; - - vma = vma_modify_flags(vmi, *prev, vma, start, end, newflags); - if (IS_ERR(vma)) { - ret = PTR_ERR(vma); - goto out; - } - - set_vma_sealed(vma); -out: - *prev = vma; - return ret; -} /* - * Check for do_mseal: - * 1> start is part of a valid vma. - * 2> end is part of a valid vma. - * 3> No gap (unallocated address) between start and end. - * 4> map is sealable. + * Does the [start, end) range contain any unmapped memory? + * + * We ensure that: + * - start is part of a valid VMA. + * - end is part of a valid VMA. + * - no gap (unallocated memory) exists between start and end. */ -static int check_mm_seal(unsigned long start, unsigned long end) +static bool range_contains_unmapped(struct mm_struct *mm, + unsigned long start, unsigned long end) { struct vm_area_struct *vma; - unsigned long nstart = start; - + unsigned long prev_end = start; VMA_ITERATOR(vmi, current->mm, start); - /* going through each vma to check. */ for_each_vma_range(vmi, vma, end) { - if (vma->vm_start > nstart) - /* unallocated memory found. */ - return -ENOMEM; - - if (vma->vm_end >= end) - return 0; + if (vma->vm_start > prev_end) + return true; - nstart = vma->vm_end; + prev_end = vma->vm_end; } - return -ENOMEM; + return prev_end < end; } -/* - * Apply sealing. 
- */ -static int apply_mm_seal(unsigned long start, unsigned long end) +static int mseal_apply(struct mm_struct *mm, + unsigned long start, unsigned long end) { - unsigned long nstart; struct vm_area_struct *vma, *prev; + unsigned long curr_start = start; + VMA_ITERATOR(vmi, mm, start); - VMA_ITERATOR(vmi, current->mm, start); - + /* We know there are no gaps so this will be non-NULL. */ vma = vma_iter_load(&vmi); - /* - * Note: check_mm_seal should already checked ENOMEM case. - * so vma should not be null, same for the other ENOMEM cases. - */ prev = vma_prev(&vmi); if (start > vma->vm_start) prev = vma; - nstart = start; for_each_vma_range(vmi, vma, end) { - int error; - unsigned long tmp; - vm_flags_t newflags; + unsigned long curr_end = MIN(vma->vm_end, end); - newflags = vma->vm_flags | VM_SEALED; - tmp = vma->vm_end; - if (tmp > end) - tmp = end; - error = mseal_fixup(&vmi, vma, &prev, nstart, tmp, newflags); - if (error) - return error; - nstart = vma_iter_end(&vmi); + if (!(vma->vm_flags & VM_SEALED)) { + vma = vma_modify_flags(&vmi, prev, vma, + curr_start, curr_end, + vma->vm_flags | VM_SEALED); + if (IS_ERR(vma)) + return PTR_ERR(vma); + vm_flags_set(vma, VM_SEALED); + } + + prev = vma; + curr_start = curr_end; } return 0; @@ -240,14 +166,10 @@ int do_mseal(unsigned long start, size_t len_in, unsigned long flags) if (mmap_write_lock_killable(mm)) return -EINTR; - /* - * First pass, this helps to avoid - * partial sealing in case of error in input address range, - * e.g. ENOMEM error. - */ - ret = check_mm_seal(start, end); - if (ret) + if (range_contains_unmapped(mm, start, end)) { + ret = -ENOMEM; goto out; + } /* * Second pass, this should success, unless there are errors @@ -255,10 +177,10 @@ int do_mseal(unsigned long start, size_t len_in, unsigned long flags) * reaching the max supported VMAs, however, those cases shall * be rare. */ - ret = apply_mm_seal(start, end); + ret = mseal_apply(mm, start, end); out: - mmap_write_unlock(current->mm); + mmap_write_unlock(mm); return ret; } diff --git a/mm/nommu.c b/mm/nommu.c index 736d0e0f0618..8b819fafd57b 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -126,7 +126,7 @@ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags) void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, - pgprot_t prot, vm_flags_t vm_flags, int node, + pgprot_t prot, unsigned long vm_flags, int node, const void *caller) { return __vmalloc_noprof(size, gfp_mask); diff --git a/mm/rmap.c b/mm/rmap.c index f93ce27132ab..568198e9efc2 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -2036,7 +2036,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, flush_cache_range(vma, address, end_addr); /* Nuke the page table entry. */ - pteval = get_and_clear_full_ptes(mm, address, pvmw.pte, nr_pages, 0); + pteval = get_and_clear_ptes(mm, address, pvmw.pte, nr_pages); /* * We clear the PTE but do not flush so potentially * a remote CPU could still be writing to the folio. diff --git a/mm/shmem.c b/mm/shmem.c index 7fdd707ac1ac..e2c76a30802b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -512,15 +512,27 @@ static int shmem_replace_entry(struct address_space *mapping, /* * Sometimes, before we decide whether to proceed or to fail, we must check - * that an entry was not already brought back from swap by a racing thread. + * that an entry was not already brought back or split by a racing thread. 
* * Checking folio is not enough: by the time a swapcache folio is locked, it * might be reused, and again be swapcache, using the same swap as before. + * Returns the swap entry's order if it still presents, else returns -1. */ -static bool shmem_confirm_swap(struct address_space *mapping, - pgoff_t index, swp_entry_t swap) +static int shmem_confirm_swap(struct address_space *mapping, pgoff_t index, + swp_entry_t swap) { - return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap); + XA_STATE(xas, &mapping->i_pages, index); + int ret = -1; + void *entry; + + rcu_read_lock(); + do { + entry = xas_load(&xas); + if (entry == swp_to_radix_entry(swap)) + ret = xas_get_order(&xas); + } while (xas_retry(&xas, entry)); + rcu_read_unlock(); + return ret; } /* @@ -891,7 +903,9 @@ static int shmem_add_to_page_cache(struct folio *folio, pgoff_t index, void *expected, gfp_t gfp) { XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio)); - long nr = folio_nr_pages(folio); + unsigned long nr = folio_nr_pages(folio); + swp_entry_t iter, swap; + void *entry; VM_BUG_ON_FOLIO(index != round_down(index, nr), folio); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); @@ -903,14 +917,25 @@ static int shmem_add_to_page_cache(struct folio *folio, gfp &= GFP_RECLAIM_MASK; folio_throttle_swaprate(folio, gfp); + swap = radix_to_swp_entry(expected); do { + iter = swap; xas_lock_irq(&xas); - if (expected != xas_find_conflict(&xas)) { - xas_set_err(&xas, -EEXIST); - goto unlock; + xas_for_each_conflict(&xas, entry) { + /* + * The range must either be empty, or filled with + * expected swap entries. Shmem swap entries are never + * partially freed without split of both entry and + * folio, so there shouldn't be any holes. + */ + if (!expected || entry != swp_to_radix_entry(iter)) { + xas_set_err(&xas, -EEXIST); + goto unlock; + } + iter.val += 1 << xas_get_order(&xas); } - if (expected && xas_find_conflict(&xas)) { + if (expected && iter.val - nr != swap.val) { xas_set_err(&xas, -EEXIST); goto unlock; } @@ -1992,30 +2017,47 @@ static struct folio *shmem_swap_alloc_folio(struct inode *inode, swp_entry_t entry, int order, gfp_t gfp) { struct shmem_inode_info *info = SHMEM_I(inode); + int nr_pages = 1 << order; struct folio *new; + gfp_t alloc_gfp; void *shadow; - int nr_pages; /* * We have arrived here because our zones are constrained, so don't * limit chance of success with further cpuset and node constraints. */ gfp &= ~GFP_CONSTRAINT_MASK; - if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && order > 0) { - gfp_t huge_gfp = vma_thp_gfp_mask(vma); + alloc_gfp = gfp; + if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { + if (WARN_ON_ONCE(order)) + return ERR_PTR(-EINVAL); + } else if (order) { + /* + * If uffd is active for the vma, we need per-page fault + * fidelity to maintain the uffd semantics, then fallback + * to swapin order-0 folio, as well as for zswap case. + * Any existing sub folio in the swap cache also blocks + * mTHP swapin. + */ + if ((vma && unlikely(userfaultfd_armed(vma))) || + !zswap_never_enabled() || + non_swapcache_batch(entry, nr_pages) != nr_pages) + goto fallback; - gfp = limit_gfp_mask(huge_gfp, gfp); + alloc_gfp = limit_gfp_mask(vma_thp_gfp_mask(vma), gfp); + } +retry: + new = shmem_alloc_folio(alloc_gfp, order, info, index); + if (!new) { + new = ERR_PTR(-ENOMEM); + goto fallback; } - new = shmem_alloc_folio(gfp, order, info, index); - if (!new) - return ERR_PTR(-ENOMEM); - - nr_pages = folio_nr_pages(new); if (mem_cgroup_swapin_charge_folio(new, vma ? 
vma->vm_mm : NULL, - gfp, entry)) { + alloc_gfp, entry)) { folio_put(new); - return ERR_PTR(-ENOMEM); + new = ERR_PTR(-ENOMEM); + goto fallback; } /* @@ -2030,7 +2072,9 @@ static struct folio *shmem_swap_alloc_folio(struct inode *inode, */ if (swapcache_prepare(entry, nr_pages)) { folio_put(new); - return ERR_PTR(-EEXIST); + new = ERR_PTR(-EEXIST); + /* Try smaller folio to avoid cache conflict */ + goto fallback; } __folio_set_locked(new); @@ -2044,6 +2088,15 @@ static struct folio *shmem_swap_alloc_folio(struct inode *inode, folio_add_lru(new); swap_read_folio(new, NULL); return new; +fallback: + /* Order 0 swapin failed, nothing to fallback to, abort */ + if (!order) + return new; + entry.val += index - round_down(index, nr_pages); + alloc_gfp = gfp; + nr_pages = 1; + order = 0; + goto retry; } /* @@ -2249,7 +2302,7 @@ unlock: if (xas_error(&xas)) return xas_error(&xas); - return entry_order; + return 0; } /* @@ -2266,133 +2319,109 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; struct mm_struct *fault_mm = vma ? vma->vm_mm : NULL; struct shmem_inode_info *info = SHMEM_I(inode); + swp_entry_t swap, index_entry; struct swap_info_struct *si; struct folio *folio = NULL; bool skip_swapcache = false; - swp_entry_t swap; - int error, nr_pages, order, split_order; + int error, nr_pages, order; + pgoff_t offset; VM_BUG_ON(!*foliop || !xa_is_value(*foliop)); - swap = radix_to_swp_entry(*foliop); + index_entry = radix_to_swp_entry(*foliop); + swap = index_entry; *foliop = NULL; - if (is_poisoned_swp_entry(swap)) + if (is_poisoned_swp_entry(index_entry)) return -EIO; - si = get_swap_device(swap); - if (!si) { - if (!shmem_confirm_swap(mapping, index, swap)) + si = get_swap_device(index_entry); + order = shmem_confirm_swap(mapping, index, index_entry); + if (unlikely(!si)) { + if (order < 0) return -EEXIST; else return -EINVAL; } + if (unlikely(order < 0)) { + put_swap_device(si); + return -EEXIST; + } + + /* index may point to the middle of a large entry, get the sub entry */ + if (order) { + offset = index - round_down(index, 1 << order); + swap = swp_entry(swp_type(swap), swp_offset(swap) + offset); + } /* Look it up and read it in.. */ folio = swap_cache_get_folio(swap, NULL, 0); - order = xa_get_order(&mapping->i_pages, index); if (!folio) { - int nr_pages = 1 << order; - bool fallback_order0 = false; - - /* Or update major stats only when swapin succeeds?? */ + if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) { + /* Direct swapin skipping swap cache & readahead */ + folio = shmem_swap_alloc_folio(inode, vma, index, + index_entry, order, gfp); + if (IS_ERR(folio)) { + error = PTR_ERR(folio); + folio = NULL; + goto failed; + } + skip_swapcache = true; + } else { + /* Cached swapin only supports order 0 folio */ + folio = shmem_swapin_cluster(swap, gfp, info, index); + if (!folio) { + error = -ENOMEM; + goto failed; + } + } if (fault_type) { *fault_type |= VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); count_memcg_event_mm(fault_mm, PGMAJFAULT); } + } + if (order > folio_order(folio)) { /* - * If uffd is active for the vma, we need per-page fault - * fidelity to maintain the uffd semantics, then fallback - * to swapin order-0 folio, as well as for zswap case. - * Any existing sub folio in the swap cache also blocks - * mTHP swapin. 
- */ - if (order > 0 && ((vma && unlikely(userfaultfd_armed(vma))) || - !zswap_never_enabled() || - non_swapcache_batch(swap, nr_pages) != nr_pages)) - fallback_order0 = true; - - /* Skip swapcache for synchronous device. */ - if (!fallback_order0 && data_race(si->flags & SWP_SYNCHRONOUS_IO)) { - folio = shmem_swap_alloc_folio(inode, vma, index, swap, order, gfp); - if (!IS_ERR(folio)) { - skip_swapcache = true; - goto alloced; - } - - /* - * Fallback to swapin order-0 folio unless the swap entry - * already exists. - */ - error = PTR_ERR(folio); - folio = NULL; - if (error == -EEXIST) - goto failed; - } - - /* - * Now swap device can only swap in order 0 folio, then we - * should split the large swap entry stored in the pagecache - * if necessary. - */ - split_order = shmem_split_large_entry(inode, index, swap, gfp); - if (split_order < 0) { - error = split_order; - goto failed; - } - - /* - * If the large swap entry has already been split, it is - * necessary to recalculate the new swap entry based on - * the old order alignment. - */ - if (split_order > 0) { - pgoff_t offset = index - round_down(index, 1 << split_order); - - swap = swp_entry(swp_type(swap), swp_offset(swap) + offset); - } - - /* Here we actually start the io */ - folio = shmem_swapin_cluster(swap, gfp, info, index); - if (!folio) { - error = -ENOMEM; - goto failed; - } - } else if (order != folio_order(folio)) { - /* - * Swap readahead may swap in order 0 folios into swapcache + * Swapin may get smaller folios due to various reasons: + * It may fallback to order 0 due to memory pressure or race, + * swap readahead may swap in order 0 folios into swapcache * asynchronously, while the shmem mapping can still stores * large swap entries. In such cases, we should split the * large swap entry to prevent possible data corruption. */ - split_order = shmem_split_large_entry(inode, index, swap, gfp); - if (split_order < 0) { - folio_put(folio); - folio = NULL; - error = split_order; - goto failed; - } - - /* - * If the large swap entry has already been split, it is - * necessary to recalculate the new swap entry based on - * the old order alignment. - */ - if (split_order > 0) { - pgoff_t offset = index - round_down(index, 1 << split_order); + error = shmem_split_large_entry(inode, index, index_entry, gfp); + if (error) + goto failed_nolock; + } - swap = swp_entry(swp_type(swap), swp_offset(swap) + offset); - } + /* + * If the folio is large, round down swap and index by folio size. + * No matter what race occurs, the swap layer ensures we either get + * a valid folio that has its swap entry aligned by size, or a + * temporarily invalid one which we'll abort very soon and retry. + * + * shmem_add_to_page_cache ensures the whole range contains expected + * entries and prevents any corruption, so any race split is fine + * too, it will succeed as long as the entries are still there. + */ + nr_pages = folio_nr_pages(folio); + if (nr_pages > 1) { + swap.val = round_down(swap.val, nr_pages); + index = round_down(index, nr_pages); } -alloced: - /* We have to do this with folio locked to prevent races */ + /* + * We have to do this with the folio locked to prevent races. + * The shmem_confirm_swap below only checks if the first swap + * entry matches the folio, that's enough to ensure the folio + * is not used outside of shmem, as shmem swap entries + * and swap cache folios are never partially freed. 
+ */ folio_lock(folio); if ((!skip_swapcache && !folio_test_swapcache(folio)) || - folio->swap.val != swap.val || - !shmem_confirm_swap(mapping, index, swap) || - xa_get_order(&mapping->i_pages, index) != folio_order(folio)) { + shmem_confirm_swap(mapping, index, swap) < 0 || + folio->swap.val != swap.val) { error = -EEXIST; goto unlock; } @@ -2415,8 +2444,7 @@ alloced: goto failed; } - error = shmem_add_to_page_cache(folio, mapping, - round_down(index, nr_pages), + error = shmem_add_to_page_cache(folio, mapping, index, swp_to_radix_entry(swap), gfp); if (error) goto failed; @@ -2439,18 +2467,19 @@ alloced: *foliop = folio; return 0; failed: - if (!shmem_confirm_swap(mapping, index, swap)) + if (shmem_confirm_swap(mapping, index, swap) < 0) error = -EEXIST; if (error == -EIO) shmem_set_folio_swapin_error(inode, index, folio, swap, skip_swapcache); unlock: - if (skip_swapcache) - swapcache_clear(si, swap, folio_nr_pages(folio)); - if (folio) { + if (folio) folio_unlock(folio); +failed_nolock: + if (skip_swapcache) + swapcache_clear(si, folio->swap, folio_nr_pages(folio)); + if (folio) folio_put(folio); - } put_swap_device(si); return error; @@ -5960,8 +5989,8 @@ struct folio *shmem_read_folio_gfp(struct address_space *mapping, struct folio *folio; int error; - error = shmem_get_folio_gfp(inode, index, 0, &folio, SGP_CACHE, - gfp, NULL, NULL); + error = shmem_get_folio_gfp(inode, index, i_size_read(inode), + &folio, SGP_CACHE, gfp, NULL, NULL); if (error) return ERR_PTR(error); @@ -1351,7 +1351,7 @@ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms, } /* Don't bother splitting the VMA if we can't unmap it anyway */ - if (!can_modify_vma(vms->vma)) { + if (vma_is_sealed(vms->vma)) { error = -EPERM; goto start_split_failed; } @@ -1371,7 +1371,7 @@ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms, for_each_vma_range(*(vms->vmi), next, vms->end) { long nrpages; - if (!can_modify_vma(next)) { + if (vma_is_sealed(next)) { error = -EPERM; goto modify_vma_failed; } @@ -559,38 +559,15 @@ struct vm_area_struct *vma_iter_next_rewind(struct vma_iterator *vmi, } #ifdef CONFIG_64BIT - static inline bool vma_is_sealed(struct vm_area_struct *vma) { return (vma->vm_flags & VM_SEALED); } - -/* - * check if a vma is sealed for modification. - * return true, if modification is allowed. 
- */ -static inline bool can_modify_vma(struct vm_area_struct *vma) -{ - if (unlikely(vma_is_sealed(vma))) - return false; - - return true; -} - -bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior); - #else - -static inline bool can_modify_vma(struct vm_area_struct *vma) -{ - return true; -} - -static inline bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior) +static inline bool vma_is_sealed(struct vm_area_struct *vma) { - return true; + return false; } - #endif #if defined(CONFIG_STACK_GROWSUP) diff --git a/tools/testing/selftests/cachestat/test_cachestat.c b/tools/testing/selftests/cachestat/test_cachestat.c index 632ab44737ec..c952640f163b 100644 --- a/tools/testing/selftests/cachestat/test_cachestat.c +++ b/tools/testing/selftests/cachestat/test_cachestat.c @@ -33,6 +33,11 @@ void print_cachestat(struct cachestat *cs) cs->nr_evicted, cs->nr_recently_evicted); } +enum file_type { + FILE_MMAP, + FILE_SHMEM +}; + bool write_exactly(int fd, size_t filesize) { int random_fd = open("/dev/urandom", O_RDONLY); @@ -201,8 +206,20 @@ out1: out: return ret; } +const char *file_type_str(enum file_type type) +{ + switch (type) { + case FILE_SHMEM: + return "shmem"; + case FILE_MMAP: + return "mmap"; + default: + return "unknown"; + } +} -bool test_cachestat_shmem(void) + +bool run_cachestat_test(enum file_type type) { size_t PS = sysconf(_SC_PAGESIZE); size_t filesize = PS * 512 * 2; /* 2 2MB huge pages */ @@ -212,27 +229,50 @@ bool test_cachestat_shmem(void) char *filename = "tmpshmcstat"; struct cachestat cs; bool ret = true; + int fd; unsigned long num_pages = compute_len / PS; - int fd = shm_open(filename, O_CREAT | O_RDWR, 0600); + if (type == FILE_SHMEM) + fd = shm_open(filename, O_CREAT | O_RDWR, 0600); + else + fd = open(filename, O_RDWR | O_CREAT | O_TRUNC, 0666); if (fd < 0) { - ksft_print_msg("Unable to create shmem file.\n"); + ksft_print_msg("Unable to create %s file.\n", + file_type_str(type)); ret = false; goto out; } if (ftruncate(fd, filesize)) { - ksft_print_msg("Unable to truncate shmem file.\n"); + ksft_print_msg("Unable to truncate %s file.\n",file_type_str(type)); ret = false; goto close_fd; } + switch (type) { + case FILE_SHMEM: + if (!write_exactly(fd, filesize)) { + ksft_print_msg("Unable to write to file.\n"); + ret = false; + goto close_fd; + } + break; + case FILE_MMAP: + char *map = mmap(NULL, filesize, PROT_READ | PROT_WRITE, + MAP_SHARED, fd, 0); - if (!write_exactly(fd, filesize)) { - ksft_print_msg("Unable to write to shmem file.\n"); + if (map == MAP_FAILED) { + ksft_print_msg("mmap failed.\n"); + ret = false; + goto close_fd; + } + for (int i = 0; i < filesize; i++) + map[i] = 'A'; + break; + default: + ksft_print_msg("Unsupported file type.\n"); ret = false; goto close_fd; } - syscall_ret = syscall(__NR_cachestat, fd, &cs_range, &cs, 0); if (syscall_ret) { @@ -308,12 +348,18 @@ int main(void) break; } - if (test_cachestat_shmem()) + if (run_cachestat_test(FILE_SHMEM)) ksft_test_result_pass("cachestat works with a shmem file\n"); else { ksft_test_result_fail("cachestat fails with a shmem file\n"); ret = 1; } + if (run_cachestat_test(FILE_MMAP)) + ksft_test_result_pass("cachestat works with a mmap file\n"); + else { + ksft_test_result_fail("cachestat fails with a mmap file\n"); + ret = 1; + } return ret; } diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore index f2dafa0b700b..e7b23a8a05fe 100644 --- a/tools/testing/selftests/mm/.gitignore +++ b/tools/testing/selftests/mm/.gitignore @@ -21,6 +21,7 
@@ on-fault-limit transhuge-stress pagemap_ioctl pfnmap +process_madv *.tmp* protection_keys protection_keys_32 diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index ae6f994d3add..d13b3cef2a2b 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -85,6 +85,7 @@ TEST_GEN_FILES += mseal_test TEST_GEN_FILES += on-fault-limit TEST_GEN_FILES += pagemap_ioctl TEST_GEN_FILES += pfnmap +TEST_GEN_FILES += process_madv TEST_GEN_FILES += thuge-gen TEST_GEN_FILES += transhuge-stress TEST_GEN_FILES += uffd-stress diff --git a/tools/testing/selftests/mm/process_madv.c b/tools/testing/selftests/mm/process_madv.c new file mode 100644 index 000000000000..471cae8427f1 --- /dev/null +++ b/tools/testing/selftests/mm/process_madv.c @@ -0,0 +1,344 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#define _GNU_SOURCE +#include "../kselftest_harness.h" +#include <errno.h> +#include <setjmp.h> +#include <signal.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <linux/mman.h> +#include <sys/syscall.h> +#include <unistd.h> +#include <sched.h> +#include "vm_util.h" + +#include "../pidfd/pidfd.h" + +FIXTURE(process_madvise) +{ + unsigned long page_size; + pid_t child_pid; + int remote_pidfd; + int pidfd; +}; + +FIXTURE_SETUP(process_madvise) +{ + self->page_size = (unsigned long)sysconf(_SC_PAGESIZE); + self->pidfd = PIDFD_SELF; + self->remote_pidfd = -1; + self->child_pid = -1; +}; + +FIXTURE_TEARDOWN_PARENT(process_madvise) +{ + /* This teardown is guaranteed to run, even if tests SKIP or ASSERT */ + if (self->child_pid > 0) { + kill(self->child_pid, SIGKILL); + waitpid(self->child_pid, NULL, 0); + } + + if (self->remote_pidfd >= 0) + close(self->remote_pidfd); +} + +static ssize_t sys_process_madvise(int pidfd, const struct iovec *iovec, + size_t vlen, int advice, unsigned int flags) +{ + return syscall(__NR_process_madvise, pidfd, iovec, vlen, advice, flags); +} + +/* + * This test uses PIDFD_SELF to target the current process. The main + * goal is to verify the basic behavior of process_madvise() with + * a vector of non-contiguous memory ranges, not its cross-process + * capabilities. + */ +TEST_F(process_madvise, basic) +{ + const unsigned long pagesize = self->page_size; + const int madvise_pages = 4; + struct iovec vec[madvise_pages]; + int pidfd = self->pidfd; + ssize_t ret; + char *map; + + /* + * Create a single large mapping. We will pick pages from this + * mapping to advise on. This ensures we test non-contiguous iovecs. + */ + map = mmap(NULL, pagesize * 10, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (map == MAP_FAILED) + SKIP(return, "mmap failed, not enough memory.\n"); + + /* Fill the entire region with a known pattern. */ + memset(map, 'A', pagesize * 10); + + /* + * Setup the iovec to point to 4 non-contiguous pages + * within the mapping. 
+ */ + vec[0].iov_base = &map[0 * pagesize]; + vec[0].iov_len = pagesize; + vec[1].iov_base = &map[3 * pagesize]; + vec[1].iov_len = pagesize; + vec[2].iov_base = &map[5 * pagesize]; + vec[2].iov_len = pagesize; + vec[3].iov_base = &map[8 * pagesize]; + vec[3].iov_len = pagesize; + + ret = sys_process_madvise(pidfd, vec, madvise_pages, MADV_DONTNEED, 0); + if (ret == -1 && errno == EPERM) + SKIP(return, + "process_madvise() unsupported or permission denied, try running as root.\n"); + else if (errno == EINVAL) + SKIP(return, + "process_madvise() unsupported or parameter invalid, please check arguments.\n"); + + /* The call should succeed and report the total bytes processed. */ + ASSERT_EQ(ret, madvise_pages * pagesize); + + /* Check that advised pages are now zero. */ + for (int i = 0; i < madvise_pages; i++) { + char *advised_page = (char *)vec[i].iov_base; + + /* Content must be 0, not 'A'. */ + ASSERT_EQ(*advised_page, '\0'); + } + + /* Check that an un-advised page in between is still 'A'. */ + char *unadvised_page = &map[1 * pagesize]; + + for (int i = 0; i < pagesize; i++) + ASSERT_EQ(unadvised_page[i], 'A'); + + /* Cleanup. */ + ASSERT_EQ(munmap(map, pagesize * 10), 0); +} + +/* + * This test deterministically validates process_madvise() with MADV_COLLAPSE + * on a remote process, other advices are difficult to verify reliably. + * + * The test verifies that a memory region in a child process, + * focus on process_madv remote result, only check addresses and lengths. + * The correctness of the MADV_COLLAPSE can be found in the relevant test examples in khugepaged. + */ +TEST_F(process_madvise, remote_collapse) +{ + const unsigned long pagesize = self->page_size; + long huge_page_size; + int pipe_info[2]; + ssize_t ret; + struct iovec vec; + + struct child_info { + pid_t pid; + void *map_addr; + } info; + + huge_page_size = read_pmd_pagesize(); + if (huge_page_size <= 0) + SKIP(return, "Could not determine a valid huge page size.\n"); + + ASSERT_EQ(pipe(pipe_info), 0); + + self->child_pid = fork(); + ASSERT_NE(self->child_pid, -1); + + if (self->child_pid == 0) { + char *map; + size_t map_size = 2 * huge_page_size; + + close(pipe_info[0]); + + map = mmap(NULL, map_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + ASSERT_NE(map, MAP_FAILED); + + /* Fault in as small pages */ + for (size_t i = 0; i < map_size; i += pagesize) + map[i] = 'A'; + + /* Send info and pause */ + info.pid = getpid(); + info.map_addr = map; + ret = write(pipe_info[1], &info, sizeof(info)); + ASSERT_EQ(ret, sizeof(info)); + close(pipe_info[1]); + + pause(); + exit(0); + } + + close(pipe_info[1]); + + /* Receive child info */ + ret = read(pipe_info[0], &info, sizeof(info)); + if (ret <= 0) { + waitpid(self->child_pid, NULL, 0); + SKIP(return, "Failed to read child info from pipe.\n"); + } + ASSERT_EQ(ret, sizeof(info)); + close(pipe_info[0]); + self->child_pid = info.pid; + + self->remote_pidfd = syscall(__NR_pidfd_open, self->child_pid, 0); + ASSERT_GE(self->remote_pidfd, 0); + + vec.iov_base = info.map_addr; + vec.iov_len = huge_page_size; + + ret = sys_process_madvise(self->remote_pidfd, &vec, 1, MADV_COLLAPSE, + 0); + if (ret == -1) { + if (errno == EINVAL) + SKIP(return, "PROCESS_MADV_ADVISE is not supported.\n"); + else if (errno == EPERM) + SKIP(return, + "No process_madvise() permissions, try running as root.\n"); + return; + } + + ASSERT_EQ(ret, huge_page_size); +} + +/* + * Test process_madvise() with a pidfd for a process that has already + * exited to ensure correct error 
handling. + */ +TEST_F(process_madvise, exited_process_pidfd) +{ + const unsigned long pagesize = self->page_size; + struct iovec vec; + char *map; + ssize_t ret; + + map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, + 0); + if (map == MAP_FAILED) + SKIP(return, "mmap failed, not enough memory.\n"); + + vec.iov_base = map; + vec.iov_len = pagesize; + + /* + * Using a pidfd for a process that has already exited should fail + * with ESRCH. + */ + self->child_pid = fork(); + ASSERT_NE(self->child_pid, -1); + + if (self->child_pid == 0) + exit(0); + + self->remote_pidfd = syscall(__NR_pidfd_open, self->child_pid, 0); + ASSERT_GE(self->remote_pidfd, 0); + + /* Wait for the child to ensure it has terminated. */ + waitpid(self->child_pid, NULL, 0); + + ret = sys_process_madvise(self->remote_pidfd, &vec, 1, MADV_DONTNEED, + 0); + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, ESRCH); +} + +/* + * Test process_madvise() with bad pidfds to ensure correct error + * handling. + */ +TEST_F(process_madvise, bad_pidfd) +{ + const unsigned long pagesize = self->page_size; + struct iovec vec; + char *map; + ssize_t ret; + + map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, + 0); + if (map == MAP_FAILED) + SKIP(return, "mmap failed, not enough memory.\n"); + + vec.iov_base = map; + vec.iov_len = pagesize; + + /* Using an invalid fd number (-1) should fail with EBADF. */ + ret = sys_process_madvise(-1, &vec, 1, MADV_DONTNEED, 0); + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, EBADF); + + /* + * Using a valid fd that is not a pidfd (e.g. stdin) should fail + * with EBADF. + */ + ret = sys_process_madvise(STDIN_FILENO, &vec, 1, MADV_DONTNEED, 0); + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, EBADF); +} + +/* + * Test that process_madvise() rejects vlen > UIO_MAXIOV. + * The kernel should return -EINVAL when the number of iovecs exceeds 1024. + */ +TEST_F(process_madvise, invalid_vlen) +{ + const unsigned long pagesize = self->page_size; + int pidfd = self->pidfd; + struct iovec vec; + char *map; + ssize_t ret; + + map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, + 0); + if (map == MAP_FAILED) + SKIP(return, "mmap failed, not enough memory.\n"); + + vec.iov_base = map; + vec.iov_len = pagesize; + + ret = sys_process_madvise(pidfd, &vec, 1025, MADV_DONTNEED, 0); + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, EINVAL); + + /* Cleanup. */ + ASSERT_EQ(munmap(map, pagesize), 0); +} + +/* + * Test process_madvise() with an invalid flag value. Currently, only a flag + * value of 0 is supported. This test is reserved for the future, e.g., if + * synchronous flags are added. + */ +TEST_F(process_madvise, flag) +{ + const unsigned long pagesize = self->page_size; + unsigned int invalid_flag; + int pidfd = self->pidfd; + struct iovec vec; + char *map; + ssize_t ret; + + map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, + 0); + if (map == MAP_FAILED) + SKIP(return, "mmap failed, not enough memory.\n"); + + vec.iov_base = map; + vec.iov_len = pagesize; + + invalid_flag = 0x80000000; + + ret = sys_process_madvise(pidfd, &vec, 1, MADV_DONTNEED, invalid_flag); + ASSERT_EQ(ret, -1); + ASSERT_EQ(errno, EINVAL); + + /* Cleanup. 
*/
+ ASSERT_EQ(munmap(map, pagesize), 0);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh
index a38c984103ce..471e539d82b8 100755
--- a/tools/testing/selftests/mm/run_vmtests.sh
+++ b/tools/testing/selftests/mm/run_vmtests.sh
@@ -65,6 +65,8 @@ separated by spaces:
 test pagemap_scan IOCTL
- pfnmap
 tests for VM_PFNMAP handling
+- process_madv
+ test for process_madv
- cow
 test copy-on-write semantics
- thp
@@ -425,6 +427,9 @@ CATEGORY="madv_guard" run_test ./guard-regions
# MADV_POPULATE_READ and MADV_POPULATE_WRITE tests
CATEGORY="madv_populate" run_test ./madv_populate
+# PROCESS_MADV test
+CATEGORY="process_madv" run_test ./process_madv
+
CATEGORY="vma_merge" run_test ./merge
if [ -x ./memfd_secret ]
diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index a838c37f93e5..3639aa8dd2b0 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -108,8 +108,10 @@ extern unsigned long dac_mmap_min_addr;
#define CAP_IPC_LOCK 14
#ifdef CONFIG_64BIT
-/* VM is sealed, in vm_flags */
-#define VM_SEALED _BITUL(63)
+#define VM_SEALED_BIT 42
+#define VM_SEALED BIT(VM_SEALED_BIT)
+#else
+#define VM_SEALED VM_NONE
#endif
#define FIRST_USER_ADDRESS 0UL
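As a minimal userspace sketch of what the new process_madv selftest exercises (an illustration under stated assumptions, not code from this commit): the program below batches MADV_DONTNEED hints for two non-contiguous pages of the caller's own mapping through a pidfd, mirroring the selftest's "basic" case. It assumes a kernel exposing __NR_pidfd_open and __NR_process_madvise (Linux 5.10 or later); older kernels may return EPERM without CAP_SYS_NICE or EINVAL for destructive advice values, which is exactly the condition the selftest handles with SKIP.

// Sketch only: batch madvise hints for the calling process via process_madvise().
// Assumes __NR_pidfd_open and __NR_process_madvise exist (Linux 5.10+); recent
// kernels also accept PIDFD_SELF directly instead of a pidfd_open() descriptor.
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/uio.h>
#include <unistd.h>

int main(void)
{
	size_t pagesize = (size_t)sysconf(_SC_PAGESIZE);
	struct iovec vec[2];
	ssize_t ret;
	int pidfd;
	char *map;

	map = mmap(NULL, pagesize * 4, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (map == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(map, 'A', pagesize * 4);

	/* Two non-contiguous pages, as in the selftest's iovec setup. */
	vec[0].iov_base = &map[0 * pagesize];
	vec[0].iov_len  = pagesize;
	vec[1].iov_base = &map[2 * pagesize];
	vec[1].iov_len  = pagesize;

	/* A pidfd referring to the current process. */
	pidfd = (int)syscall(__NR_pidfd_open, getpid(), 0);
	if (pidfd < 0) {
		perror("pidfd_open");
		return 1;
	}

	/* May fail with EPERM/EINVAL on kernels that restrict the advice set. */
	ret = syscall(__NR_process_madvise, pidfd, vec, 2, MADV_DONTNEED, 0);
	if (ret < 0)
		perror("process_madvise");
	else
		printf("advised %zd bytes, first advised byte is now %d\n",
		       ret, map[0]);

	close(pidfd);
	munmap(map, pagesize * 4);
	return 0;
}

On success the call reports the total number of bytes processed (two pages here) and the advised anonymous pages read back as zero, matching the assertions in the selftest's basic case.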