From 800977f6f32e452cba6b04ef21d2f5383ca29209 Mon Sep 17 00:00:00 2001 From: Cai Huoqing Date: Fri, 14 Jan 2022 14:02:52 -0800 Subject: kthread: add the helper function kthread_run_on_cpu() Add a new helper function kthread_run_on_cpu(), which includes kthread_create_on_cpu/wake_up_process(). In some cases, use kthread_run_on_cpu() directly instead of kthread_create_on_node/kthread_bind/wake_up_process() or kthread_create_on_cpu/wake_up_process() or kthreadd_create/kthread_bind/wake_up_process() to simplify the code. [akpm@linux-foundation.org: export kthread_create_on_cpu to modules] Link: https://lkml.kernel.org/r/20211022025711.3673-2-caihuoqing@baidu.com Signed-off-by: Cai Huoqing Cc: Bernard Metzler Cc: Cai Huoqing Cc: Daniel Bristot de Oliveira Cc: Davidlohr Bueso Cc: Doug Ledford Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: Joel Fernandes (Google) Cc: Josh Triplett Cc: Lai Jiangshan Cc: Mathieu Desnoyers Cc: "Paul E . McKenney" Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kthread.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 7113003fab63..4ed9e7bce9e8 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -523,6 +523,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), to_kthread(p)->cpu = cpu; return p; } +EXPORT_SYMBOL(kthread_create_on_cpu); void kthread_set_per_cpu(struct task_struct *k, int cpu) { -- cgit v1.2.3-59-g8ed1b From 64ed3a049e3e81b801e7c5bb052416152443f585 Mon Sep 17 00:00:00 2001 From: Cai Huoqing Date: Fri, 14 Jan 2022 14:02:59 -0800 Subject: ring-buffer: make use of the helper function kthread_run_on_cpu() Replace kthread_create/kthread_bind/wake_up_process() with kthread_run_on_cpu() to simplify the code. Link: https://lkml.kernel.org/r/20211022025711.3673-4-caihuoqing@baidu.com Signed-off-by: Cai Huoqing Cc: Bernard Metzler Cc: Daniel Bristot de Oliveira Cc: Davidlohr Bueso Cc: Doug Ledford Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: Joel Fernandes (Google) Cc: Josh Triplett Cc: Lai Jiangshan Cc: Mathieu Desnoyers Cc: "Paul E . McKenney" Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/trace/ring_buffer.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 2699e9e562b1..05dfc7a12d3d 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -5898,16 +5898,13 @@ static __init int test_ringbuffer(void) rb_data[cpu].buffer = buffer; rb_data[cpu].cpu = cpu; rb_data[cpu].cnt = cpu; - rb_threads[cpu] = kthread_create(rb_test, &rb_data[cpu], - "rbtester/%d", cpu); + rb_threads[cpu] = kthread_run_on_cpu(rb_test, &rb_data[cpu], + cpu, "rbtester/%u"); if (WARN_ON(IS_ERR(rb_threads[cpu]))) { pr_cont("FAILED\n"); ret = PTR_ERR(rb_threads[cpu]); goto out_free; } - - kthread_bind(rb_threads[cpu], cpu); - wake_up_process(rb_threads[cpu]); } /* Now create the rb hammer! */ -- cgit v1.2.3-59-g8ed1b From 3b9cb4ba4b54ecc6cf7d04ea9085d2ad2be48733 Mon Sep 17 00:00:00 2001 From: Cai Huoqing Date: Fri, 14 Jan 2022 14:03:02 -0800 Subject: rcutorture: make use of the helper function kthread_run_on_cpu() Replace kthread_create_on_node/kthread_bind/wake_up_process() with kthread_run_on_cpu() to simplify the code. Link: https://lkml.kernel.org/r/20211022025711.3673-5-caihuoqing@baidu.com Signed-off-by: Cai Huoqing Cc: Bernard Metzler Cc: Daniel Bristot de Oliveira Cc: Davidlohr Bueso Cc: Doug Ledford Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: Joel Fernandes (Google) Cc: Josh Triplett Cc: Lai Jiangshan Cc: Mathieu Desnoyers Cc: "Paul E . McKenney" Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/rcu/rcutorture.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 8b410d982990..42bc66a2f170 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1992,9 +1992,8 @@ static int rcutorture_booster_init(unsigned int cpu) mutex_lock(&boost_mutex); rcu_torture_disable_rt_throttle(); VERBOSE_TOROUT_STRING("Creating rcu_torture_boost task"); - boost_tasks[cpu] = kthread_create_on_node(rcu_torture_boost, NULL, - cpu_to_node(cpu), - "rcu_torture_boost"); + boost_tasks[cpu] = kthread_run_on_cpu(rcu_torture_boost, NULL, + cpu, "rcu_torture_boost_%u"); if (IS_ERR(boost_tasks[cpu])) { retval = PTR_ERR(boost_tasks[cpu]); VERBOSE_TOROUT_STRING("rcu_torture_boost task create failed"); @@ -2003,8 +2002,6 @@ static int rcutorture_booster_init(unsigned int cpu) mutex_unlock(&boost_mutex); return retval; } - kthread_bind(boost_tasks[cpu], cpu); - wake_up_process(boost_tasks[cpu]); mutex_unlock(&boost_mutex); return 0; } -- cgit v1.2.3-59-g8ed1b From 11e4e3523da98c065a6c249013ace0d388e41c25 Mon Sep 17 00:00:00 2001 From: Cai Huoqing Date: Fri, 14 Jan 2022 14:03:06 -0800 Subject: trace/osnoise: make use of the helper function kthread_run_on_cpu() Replace kthread_create_on_cpu/wake_up_process() with kthread_run_on_cpu() to simplify the code. Link: https://lkml.kernel.org/r/20211022025711.3673-6-caihuoqing@baidu.com Signed-off-by: Cai Huoqing Cc: Bernard Metzler Cc: Daniel Bristot de Oliveira Cc: Davidlohr Bueso Cc: Doug Ledford Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: Joel Fernandes (Google) Cc: Josh Triplett Cc: Lai Jiangshan Cc: Mathieu Desnoyers Cc: "Paul E . McKenney" Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/trace/trace_osnoise.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 7520d43aed55..89d6cbac6f10 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1701,7 +1701,7 @@ static int start_kthread(unsigned int cpu) snprintf(comm, 24, "osnoise/%d", cpu); } - kthread = kthread_create_on_cpu(main, NULL, cpu, comm); + kthread = kthread_run_on_cpu(main, NULL, cpu, comm); if (IS_ERR(kthread)) { pr_err(BANNER "could not start sampling thread\n"); @@ -1710,7 +1710,6 @@ static int start_kthread(unsigned int cpu) } per_cpu(per_cpu_osnoise_var, cpu).kthread = kthread; - wake_up_process(kthread); return 0; } -- cgit v1.2.3-59-g8ed1b From ff78f6679d2e223e073fcbdc8f70b6bc0abadf99 Mon Sep 17 00:00:00 2001 From: Cai Huoqing Date: Fri, 14 Jan 2022 14:03:10 -0800 Subject: trace/hwlat: make use of the helper function kthread_run_on_cpu() Replace kthread_create_on_cpu/wake_up_process() with kthread_run_on_cpu() to simplify the code. Link: https://lkml.kernel.org/r/20211022025711.3673-7-caihuoqing@baidu.com Signed-off-by: Cai Huoqing Cc: Bernard Metzler Cc: Daniel Bristot de Oliveira Cc: Davidlohr Bueso Cc: Doug Ledford Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: Joel Fernandes (Google) Cc: Josh Triplett Cc: Lai Jiangshan Cc: Mathieu Desnoyers Cc: "Paul E . McKenney" Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/trace/trace_hwlat.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index 56bb7b890578..d440ddd5fd8b 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -491,18 +491,14 @@ static void stop_per_cpu_kthreads(void) static int start_cpu_kthread(unsigned int cpu) { struct task_struct *kthread; - char comm[24]; - snprintf(comm, 24, "hwlatd/%d", cpu); - - kthread = kthread_create_on_cpu(kthread_fn, NULL, cpu, comm); + kthread = kthread_run_on_cpu(kthread_fn, NULL, cpu, "hwlatd/%u"); if (IS_ERR(kthread)) { pr_err(BANNER "could not start sampling thread\n"); return -ENOMEM; } per_cpu(hwlat_per_cpu_data, cpu).kthread = kthread; - wake_up_process(kthread); return 0; } -- cgit v1.2.3-59-g8ed1b From 9a10064f5625d5572c3626c1516e0bebc6c9fe9b Mon Sep 17 00:00:00 2001 From: Colin Cross Date: Fri, 14 Jan 2022 14:05:59 -0800 Subject: mm: add a field to store names for private anonymous memory In many userspace applications, and especially in VM based applications like Android uses heavily, there are multiple different allocators in use. At a minimum there is libc malloc and the stack, and in many cases there are libc malloc, the stack, direct syscalls to mmap anonymous memory, and multiple VM heaps (one for small objects, one for big objects, etc.). Each of these layers usually has its own tools to inspect its usage; malloc by compiling a debug version, the VM through heap inspection tools, and for direct syscalls there is usually no way to track them. On Android we heavily use a set of tools that use an extended version of the logic covered in Documentation/vm/pagemap.txt to walk all pages mapped in userspace and slice their usage by process, shared (COW) vs. unique mappings, backing, etc. This can account for real physical memory usage even in cases like fork without exec (which Android uses heavily to share as many private COW pages as possible between processes), Kernel SamePage Merging, and clean zero pages. It produces a measurement of the pages that only exist in that process (USS, for unique), and a measurement of the physical memory usage of that process with the cost of shared pages being evenly split between processes that share them (PSS). If all anonymous memory is indistinguishable then figuring out the real physical memory usage (PSS) of each heap requires either a pagemap walking tool that can understand the heap debugging of every layer, or for every layer's heap debugging tools to implement the pagemap walking logic, in which case it is hard to get a consistent view of memory across the whole system. Tracking the information in userspace leads to all sorts of problems. It either needs to be stored inside the process, which means every process has to have an API to export its current heap information upon request, or it has to be stored externally in a filesystem that somebody needs to clean up on crashes. It needs to be readable while the process is still running, so it has to have some sort of synchronization with every layer of userspace. Efficiently tracking the ranges requires reimplementing something like the kernel vma trees, and linking to it from every layer of userspace. It requires more memory, more syscalls, more runtime cost, and more complexity to separately track regions that the kernel is already tracking. This patch adds a field to /proc/pid/maps and /proc/pid/smaps to show a userspace-provided name for anonymous vmas. The names of named anonymous vmas are shown in /proc/pid/maps and /proc/pid/smaps as [anon:]. Userspace can set the name for a region of memory by calling prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, start, len, (unsigned long)name) Setting the name to NULL clears it. The name length limit is 80 bytes including NUL-terminator and is checked to contain only printable ascii characters (including space), except '[',']','\','$' and '`'. Ascii strings are being used to have a descriptive identifiers for vmas, which can be understood by the users reading /proc/pid/maps or /proc/pid/smaps. Names can be standardized for a given system and they can include some variable parts such as the name of the allocator or a library, tid of the thread using it, etc. The name is stored in a pointer in the shared union in vm_area_struct that points to a null terminated string. Anonymous vmas with the same name (equivalent strings) and are otherwise mergeable will be merged. The name pointers are not shared between vmas even if they contain the same name. The name pointer is stored in a union with fields that are only used on file-backed mappings, so it does not increase memory usage. CONFIG_ANON_VMA_NAME kernel configuration is introduced to enable this feature. It keeps the feature disabled by default to prevent any additional memory overhead and to avoid confusing procfs parsers on systems which are not ready to support named anonymous vmas. The patch is based on the original patch developed by Colin Cross, more specifically on its latest version [1] posted upstream by Sumit Semwal. It used a userspace pointer to store vma names. In that design, name pointers could be shared between vmas. However during the last upstreaming attempt, Kees Cook raised concerns [2] about this approach and suggested to copy the name into kernel memory space, perform validity checks [3] and store as a string referenced from vm_area_struct. One big concern is about fork() performance which would need to strdup anonymous vma names. Dave Hansen suggested experimenting with worst-case scenario of forking a process with 64k vmas having longest possible names [4]. I ran this experiment on an ARM64 Android device and recorded a worst-case regression of almost 40% when forking such a process. This regression is addressed in the followup patch which replaces the pointer to a name with a refcounted structure that allows sharing the name pointer between vmas of the same name. Instead of duplicating the string during fork() or when splitting a vma it increments the refcount. [1] https://lore.kernel.org/linux-mm/20200901161459.11772-4-sumit.semwal@linaro.org/ [2] https://lore.kernel.org/linux-mm/202009031031.D32EF57ED@keescook/ [3] https://lore.kernel.org/linux-mm/202009031022.3834F692@keescook/ [4] https://lore.kernel.org/linux-mm/5d0358ab-8c47-2f5f-8e43-23b89d6a8e95@intel.com/ Changes for prctl(2) manual page (in the options section): PR_SET_VMA Sets an attribute specified in arg2 for virtual memory areas starting from the address specified in arg3 and spanning the size specified in arg4. arg5 specifies the value of the attribute to be set. Note that assigning an attribute to a virtual memory area might prevent it from being merged with adjacent virtual memory areas due to the difference in that attribute's value. Currently, arg2 must be one of: PR_SET_VMA_ANON_NAME Set a name for anonymous virtual memory areas. arg5 should be a pointer to a null-terminated string containing the name. The name length including null byte cannot exceed 80 bytes. If arg5 is NULL, the name of the appropriate anonymous virtual memory areas will be reset. The name can contain only printable ascii characters (including space), except '[',']','\','$' and '`'. This feature is available only if the kernel is built with the CONFIG_ANON_VMA_NAME option enabled. [surenb@google.com: docs: proc.rst: /proc/PID/maps: fix malformed table] Link: https://lkml.kernel.org/r/20211123185928.2513763-1-surenb@google.com [surenb: rebased over v5.15-rc6, replaced userpointer with a kernel copy, added input sanitization and CONFIG_ANON_VMA_NAME config. The bulk of the work here was done by Colin Cross, therefore, with his permission, keeping him as the author] Link: https://lkml.kernel.org/r/20211019215511.3771969-2-surenb@google.com Signed-off-by: Colin Cross Signed-off-by: Suren Baghdasaryan Reviewed-by: Kees Cook Cc: Stephen Rothwell Cc: Al Viro Cc: Cyrill Gorcunov Cc: Dave Hansen Cc: David Rientjes Cc: "Eric W. Biederman" Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jan Glauber Cc: Johannes Weiner Cc: John Stultz Cc: Mel Gorman Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Rob Landley Cc: "Serge E. Hallyn" Cc: Shaohua Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.rst | 6 +- fs/proc/task_mmu.c | 12 +++- fs/userfaultfd.c | 7 +- include/linux/mm.h | 13 +++- include/linux/mm_types.h | 64 ++++++++++++++++-- include/uapi/linux/prctl.h | 3 + kernel/fork.c | 2 + kernel/sys.c | 63 ++++++++++++++++++ mm/Kconfig | 14 ++++ mm/madvise.c | 129 +++++++++++++++++++++++++++++++++++-- mm/mempolicy.c | 3 +- mm/mlock.c | 2 +- mm/mmap.c | 38 ++++++----- mm/mprotect.c | 2 +- 14 files changed, 324 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index 8d7f141c6fc7..061744c436d9 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -426,12 +426,14 @@ with the memory region, as the case would be with BSS (uninitialized data). The "pathname" shows the name associated file for this mapping. If the mapping is not associated with a file: - ======= ==================================== + ============= ==================================== [heap] the heap of the program [stack] the stack of the main process [vdso] the "virtual dynamic shared object", the kernel system call handler - ======= ==================================== + [anon:] an anonymous mapping that has been + named by userspace + ============= ==================================== or if empty, the mapping is anonymous. diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index ad667dbc96f5..e6998652fd67 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -308,6 +308,8 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) name = arch_vma_name(vma); if (!name) { + const char *anon_name; + if (!mm) { name = "[vdso]"; goto done; @@ -319,8 +321,16 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) goto done; } - if (is_stack(vma)) + if (is_stack(vma)) { name = "[stack]"; + goto done; + } + + anon_name = vma_anon_name(vma); + if (anon_name) { + seq_pad(m, ' '); + seq_printf(m, "[anon:%s]", anon_name); + } } done: diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 22bf14ab2d16..5b2af7b82776 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -877,7 +877,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), - NULL_VM_UFFD_CTX); + NULL_VM_UFFD_CTX, vma_anon_name(vma)); if (prev) vma = prev; else @@ -1436,7 +1436,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, prev = vma_merge(mm, prev, start, vma_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), - ((struct vm_userfaultfd_ctx){ ctx })); + ((struct vm_userfaultfd_ctx){ ctx }), + vma_anon_name(vma)); if (prev) { vma = prev; goto next; @@ -1613,7 +1614,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, prev = vma_merge(mm, prev, start, vma_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), - NULL_VM_UFFD_CTX); + NULL_VM_UFFD_CTX, vma_anon_name(vma)); if (prev) { vma = prev; goto next; diff --git a/include/linux/mm.h b/include/linux/mm.h index a7e4a9e7d807..7000442984b9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2658,7 +2658,7 @@ static inline int vma_adjust(struct vm_area_struct *vma, unsigned long start, extern struct vm_area_struct *vma_merge(struct mm_struct *, struct vm_area_struct *prev, unsigned long addr, unsigned long end, unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t, - struct mempolicy *, struct vm_userfaultfd_ctx); + struct mempolicy *, struct vm_userfaultfd_ctx, const char *); extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *); extern int __split_vma(struct mm_struct *, struct vm_area_struct *, unsigned long addr, int new_below); @@ -3391,5 +3391,16 @@ static inline int seal_check_future_write(int seals, struct vm_area_struct *vma) return 0; } +#ifdef CONFIG_ANON_VMA_NAME +int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, + unsigned long len_in, const char *name); +#else +static inline int +madvise_set_anon_name(struct mm_struct *mm, unsigned long start, + unsigned long len_in, const char *name) { + return 0; +} +#endif + #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index c3a6e6209600..799e2ee626b2 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -426,11 +426,19 @@ struct vm_area_struct { /* * For areas with an address space and backing store, * linkage into the address_space->i_mmap interval tree. + * + * For private anonymous mappings, a pointer to a null terminated string + * containing the name given to the vma, or NULL if unnamed. */ - struct { - struct rb_node rb; - unsigned long rb_subtree_last; - } shared; + + union { + struct { + struct rb_node rb; + unsigned long rb_subtree_last; + } shared; + /* Serialized by mmap_sem. */ + char *anon_name; + }; /* * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma @@ -875,4 +883,52 @@ typedef struct { unsigned long val; } swp_entry_t; +#ifdef CONFIG_ANON_VMA_NAME +/* + * mmap_lock should be read-locked when calling vma_anon_name() and while using + * the returned pointer. + */ +extern const char *vma_anon_name(struct vm_area_struct *vma); + +/* + * mmap_lock should be read-locked for orig_vma->vm_mm. + * mmap_lock should be write-locked for new_vma->vm_mm or new_vma should be + * isolated. + */ +extern void dup_vma_anon_name(struct vm_area_struct *orig_vma, + struct vm_area_struct *new_vma); + +/* + * mmap_lock should be write-locked or vma should have been isolated under + * write-locked mmap_lock protection. + */ +extern void free_vma_anon_name(struct vm_area_struct *vma); + +/* mmap_lock should be read-locked */ +static inline bool is_same_vma_anon_name(struct vm_area_struct *vma, + const char *name) +{ + const char *vma_name = vma_anon_name(vma); + + /* either both NULL, or pointers to same string */ + if (vma_name == name) + return true; + + return name && vma_name && !strcmp(name, vma_name); +} +#else /* CONFIG_ANON_VMA_NAME */ +static inline const char *vma_anon_name(struct vm_area_struct *vma) +{ + return NULL; +} +static inline void dup_vma_anon_name(struct vm_area_struct *orig_vma, + struct vm_area_struct *new_vma) {} +static inline void free_vma_anon_name(struct vm_area_struct *vma) {} +static inline bool is_same_vma_anon_name(struct vm_area_struct *vma, + const char *name) +{ + return true; +} +#endif /* CONFIG_ANON_VMA_NAME */ + #endif /* _LINUX_MM_TYPES_H */ diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index bb73e9a0b24f..e998764f0262 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -272,4 +272,7 @@ struct prctl_mm_map { # define PR_SCHED_CORE_SCOPE_THREAD_GROUP 1 # define PR_SCHED_CORE_SCOPE_PROCESS_GROUP 2 +#define PR_SET_VMA 0x53564d41 +# define PR_SET_VMA_ANON_NAME 0 + #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/fork.c b/kernel/fork.c index 3244cc56b697..4cf20b5f2da3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -365,12 +365,14 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) *new = data_race(*orig); INIT_LIST_HEAD(&new->anon_vma_chain); new->vm_next = new->vm_prev = NULL; + dup_vma_anon_name(orig, new); } return new; } void vm_area_free(struct vm_area_struct *vma) { + free_vma_anon_name(vma); kmem_cache_free(vm_area_cachep, vma); } diff --git a/kernel/sys.c b/kernel/sys.c index 8fdac0d90504..2450a9f33cb0 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2261,6 +2261,66 @@ int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which, #define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LOCAL_THROTTLE) +#ifdef CONFIG_ANON_VMA_NAME + +#define ANON_VMA_NAME_MAX_LEN 80 +#define ANON_VMA_NAME_INVALID_CHARS "\\`$[]" + +static inline bool is_valid_name_char(char ch) +{ + /* printable ascii characters, excluding ANON_VMA_NAME_INVALID_CHARS */ + return ch > 0x1f && ch < 0x7f && + !strchr(ANON_VMA_NAME_INVALID_CHARS, ch); +} + +static int prctl_set_vma(unsigned long opt, unsigned long addr, + unsigned long size, unsigned long arg) +{ + struct mm_struct *mm = current->mm; + const char __user *uname; + char *name, *pch; + int error; + + switch (opt) { + case PR_SET_VMA_ANON_NAME: + uname = (const char __user *)arg; + if (uname) { + name = strndup_user(uname, ANON_VMA_NAME_MAX_LEN); + + if (IS_ERR(name)) + return PTR_ERR(name); + + for (pch = name; *pch != '\0'; pch++) { + if (!is_valid_name_char(*pch)) { + kfree(name); + return -EINVAL; + } + } + } else { + /* Reset the name */ + name = NULL; + } + + mmap_write_lock(mm); + error = madvise_set_anon_name(mm, addr, size, name); + mmap_write_unlock(mm); + kfree(name); + break; + default: + error = -EINVAL; + } + + return error; +} + +#else /* CONFIG_ANON_VMA_NAME */ +static int prctl_set_vma(unsigned long opt, unsigned long start, + unsigned long size, unsigned long arg) +{ + return -EINVAL; +} +#endif /* CONFIG_ANON_VMA_NAME */ + SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, unsigned long, arg4, unsigned long, arg5) { @@ -2530,6 +2590,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = sched_core_share_pid(arg2, arg3, arg4, arg5); break; #endif + case PR_SET_VMA: + error = prctl_set_vma(arg2, arg3, arg4, arg5); + break; default: error = -EINVAL; break; diff --git a/mm/Kconfig b/mm/Kconfig index 356f4f2c779e..53d7485fc38f 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -900,6 +900,20 @@ config IO_MAPPING config SECRETMEM def_bool ARCH_HAS_SET_DIRECT_MAP && !EMBEDDED +config ANON_VMA_NAME + bool "Anonymous VMA name support" + depends on PROC_FS && ADVISE_SYSCALLS && MMU + + help + Allow naming anonymous virtual memory areas. + + This feature allows assigning names to virtual memory areas. Assigned + names can be later retrieved from /proc/pid/maps and /proc/pid/smaps + and help identifying individual anonymous memory areas. + Assigning a name to anonymous virtual memory area might prevent that + area from being merged with adjacent virtual memory areas due to the + difference in their name. + source "mm/damon/Kconfig" endmenu diff --git a/mm/madvise.c b/mm/madvise.c index 4b9c5509990c..413bbc6e40a0 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -62,19 +63,84 @@ static int madvise_need_mmap_write(int behavior) } } +#ifdef CONFIG_ANON_VMA_NAME +static inline bool has_vma_anon_name(struct vm_area_struct *vma) +{ + return !vma->vm_file && vma->anon_name; +} + +const char *vma_anon_name(struct vm_area_struct *vma) +{ + if (!has_vma_anon_name(vma)) + return NULL; + + mmap_assert_locked(vma->vm_mm); + + return vma->anon_name; +} + +void dup_vma_anon_name(struct vm_area_struct *orig_vma, + struct vm_area_struct *new_vma) +{ + if (!has_vma_anon_name(orig_vma)) + return; + + new_vma->anon_name = kstrdup(orig_vma->anon_name, GFP_KERNEL); +} + +void free_vma_anon_name(struct vm_area_struct *vma) +{ + if (!has_vma_anon_name(vma)) + return; + + kfree(vma->anon_name); + vma->anon_name = NULL; +} + +/* mmap_lock should be write-locked */ +static int replace_vma_anon_name(struct vm_area_struct *vma, const char *name) +{ + if (!name) { + free_vma_anon_name(vma); + return 0; + } + + if (vma->anon_name) { + /* Same name, nothing to do here */ + if (!strcmp(name, vma->anon_name)) + return 0; + + free_vma_anon_name(vma); + } + vma->anon_name = kstrdup(name, GFP_KERNEL); + if (!vma->anon_name) + return -ENOMEM; + + return 0; +} +#else /* CONFIG_ANON_VMA_NAME */ +static int replace_vma_anon_name(struct vm_area_struct *vma, const char *name) +{ + if (name) + return -EINVAL; + + return 0; +} +#endif /* CONFIG_ANON_VMA_NAME */ /* * Update the vm_flags on region of a vma, splitting it or merging it as * necessary. Must be called with mmap_sem held for writing; */ static int madvise_update_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, - unsigned long end, unsigned long new_flags) + unsigned long end, unsigned long new_flags, + const char *name) { struct mm_struct *mm = vma->vm_mm; int error; pgoff_t pgoff; - if (new_flags == vma->vm_flags) { + if (new_flags == vma->vm_flags && is_same_vma_anon_name(vma, name)) { *prev = vma; return 0; } @@ -82,7 +148,7 @@ static int madvise_update_vma(struct vm_area_struct *vma, pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx); + vma->vm_userfaultfd_ctx, name); if (*prev) { vma = *prev; goto success; @@ -111,6 +177,11 @@ success: * vm_flags is protected by the mmap_lock held in write mode. */ vma->vm_flags = new_flags; + if (!vma->vm_file) { + error = replace_vma_anon_name(vma, name); + if (error) + return error; + } return 0; } @@ -938,7 +1009,8 @@ static int madvise_vma_behavior(struct vm_area_struct *vma, break; } - error = madvise_update_vma(vma, prev, start, end, new_flags); + error = madvise_update_vma(vma, prev, start, end, new_flags, + vma_anon_name(vma)); out: /* @@ -1118,6 +1190,55 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start, return unmapped_error; } +#ifdef CONFIG_ANON_VMA_NAME +static int madvise_vma_anon_name(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end, + unsigned long name) +{ + int error; + + /* Only anonymous mappings can be named */ + if (vma->vm_file) + return -EBADF; + + error = madvise_update_vma(vma, prev, start, end, vma->vm_flags, + (const char *)name); + + /* + * madvise() returns EAGAIN if kernel resources, such as + * slab, are temporarily unavailable. + */ + if (error == -ENOMEM) + error = -EAGAIN; + return error; +} + +int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, + unsigned long len_in, const char *name) +{ + unsigned long end; + unsigned long len; + + if (start & ~PAGE_MASK) + return -EINVAL; + len = (len_in + ~PAGE_MASK) & PAGE_MASK; + + /* Check to see whether len was rounded up from small -ve to zero */ + if (len_in && !len) + return -EINVAL; + + end = start + len; + if (end < start) + return -EINVAL; + + if (end == start) + return 0; + + return madvise_walk_vmas(mm, start, end, (unsigned long)name, + madvise_vma_anon_name); +} +#endif /* CONFIG_ANON_VMA_NAME */ /* * The madvise(2) system call. * diff --git a/mm/mempolicy.c b/mm/mempolicy.c index f6248affaf38..679f47b3a079 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -810,7 +810,8 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, ((vmstart - vma->vm_start) >> PAGE_SHIFT); prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, - new_pol, vma->vm_userfaultfd_ctx); + new_pol, vma->vm_userfaultfd_ctx, + vma_anon_name(vma)); if (prev) { vma = prev; next = vma->vm_next; diff --git a/mm/mlock.c b/mm/mlock.c index e263d62ae2d0..8f584eddd305 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -512,7 +512,7 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx); + vma->vm_userfaultfd_ctx, vma_anon_name(vma)); if (*prev) { vma = *prev; goto success; diff --git a/mm/mmap.c b/mm/mmap.c index bfb0ea164a90..85edb0011453 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1029,7 +1029,8 @@ again: */ static inline int is_mergeable_vma(struct vm_area_struct *vma, struct file *file, unsigned long vm_flags, - struct vm_userfaultfd_ctx vm_userfaultfd_ctx) + struct vm_userfaultfd_ctx vm_userfaultfd_ctx, + const char *anon_name) { /* * VM_SOFTDIRTY should not prevent from VMA merging, if we @@ -1047,6 +1048,8 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma, return 0; if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx)) return 0; + if (!is_same_vma_anon_name(vma, anon_name)) + return 0; return 1; } @@ -1079,9 +1082,10 @@ static int can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff, - struct vm_userfaultfd_ctx vm_userfaultfd_ctx) + struct vm_userfaultfd_ctx vm_userfaultfd_ctx, + const char *anon_name) { - if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) && + if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) && is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { if (vma->vm_pgoff == vm_pgoff) return 1; @@ -1100,9 +1104,10 @@ static int can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff, - struct vm_userfaultfd_ctx vm_userfaultfd_ctx) + struct vm_userfaultfd_ctx vm_userfaultfd_ctx, + const char *anon_name) { - if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) && + if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) && is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { pgoff_t vm_pglen; vm_pglen = vma_pages(vma); @@ -1113,9 +1118,9 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, } /* - * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out - * whether that can be merged with its predecessor or its successor. - * Or both (it neatly fills a hole). + * Given a mapping request (addr,end,vm_flags,file,pgoff,anon_name), + * figure out whether that can be merged with its predecessor or its + * successor. Or both (it neatly fills a hole). * * In most cases - when called for mmap, brk or mremap - [addr,end) is * certain not to be mapped by the time vma_merge is called; but when @@ -1160,7 +1165,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, unsigned long end, unsigned long vm_flags, struct anon_vma *anon_vma, struct file *file, pgoff_t pgoff, struct mempolicy *policy, - struct vm_userfaultfd_ctx vm_userfaultfd_ctx) + struct vm_userfaultfd_ctx vm_userfaultfd_ctx, + const char *anon_name) { pgoff_t pglen = (end - addr) >> PAGE_SHIFT; struct vm_area_struct *area, *next; @@ -1190,7 +1196,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, mpol_equal(vma_policy(prev), policy) && can_vma_merge_after(prev, vm_flags, anon_vma, file, pgoff, - vm_userfaultfd_ctx)) { + vm_userfaultfd_ctx, anon_name)) { /* * OK, it can. Can we now merge in the successor as well? */ @@ -1199,7 +1205,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, can_vma_merge_before(next, vm_flags, anon_vma, file, pgoff+pglen, - vm_userfaultfd_ctx) && + vm_userfaultfd_ctx, anon_name) && is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL)) { /* cases 1, 6 */ @@ -1222,7 +1228,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, mpol_equal(policy, vma_policy(next)) && can_vma_merge_before(next, vm_flags, anon_vma, file, pgoff+pglen, - vm_userfaultfd_ctx)) { + vm_userfaultfd_ctx, anon_name)) { if (prev && addr < prev->vm_end) /* case 4 */ err = __vma_adjust(prev, prev->vm_start, addr, prev->vm_pgoff, NULL, next); @@ -1754,7 +1760,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, * Can we just expand an old mapping? */ vma = vma_merge(mm, prev, addr, addr + len, vm_flags, - NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX); + NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX, NULL); if (vma) goto out; @@ -1803,7 +1809,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, */ if (unlikely(vm_flags != vma->vm_flags && prev)) { merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags, - NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX); + NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX, NULL); if (merge) { /* ->mmap() can change vma->vm_file and fput the original file. So * fput the vma->vm_file here or we would add an extra fput for file @@ -3056,7 +3062,7 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla /* Can we just expand an old private anonymous mapping? */ vma = vma_merge(mm, prev, addr, addr + len, flags, - NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX); + NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX, NULL); if (vma) goto out; @@ -3249,7 +3255,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, return NULL; /* should never get here */ new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx); + vma->vm_userfaultfd_ctx, vma_anon_name(vma)); if (new_vma) { /* * Source vma may have been merged into new_vma diff --git a/mm/mprotect.c b/mm/mprotect.c index e552f5e0ccbd..0138dfcdb1d8 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -464,7 +464,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *pprev = vma_merge(mm, *pprev, start, end, newflags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx); + vma->vm_userfaultfd_ctx, vma_anon_name(vma)); if (*pprev) { vma = *pprev; VM_WARN_ON((vma->vm_flags ^ newflags) & ~VM_SOFTDIRTY); -- cgit v1.2.3-59-g8ed1b From 17fca131cee21724ee953a17c185c14e9533af5b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 14 Jan 2022 14:06:07 -0800 Subject: mm: move anon_vma declarations to linux/mm_inline.h The patch to add anonymous vma names causes a build failure in some configurations: include/linux/mm_types.h: In function 'is_same_vma_anon_name': include/linux/mm_types.h:924:37: error: implicit declaration of function 'strcmp' [-Werror=implicit-function-declaration] 924 | return name && vma_name && !strcmp(name, vma_name); | ^~~~~~ include/linux/mm_types.h:22:1: note: 'strcmp' is defined in header ''; did you forget to '#include '? This should not really be part of linux/mm_types.h in the first place, as that header is meant to only contain structure defintions and need a minimum set of indirect includes itself. While the header clearly includes more than it should at this point, let's not make it worse by including string.h as well, which would pull in the expensive (compile-speed wise) fortify-string logic. Move the new functions into a separate header that only needs to be included in a couple of locations. Link: https://lkml.kernel.org/r/20211207125710.2503446-1-arnd@kernel.org Fixes: "mm: add a field to store names for private anonymous memory" Signed-off-by: Arnd Bergmann Cc: Al Viro Cc: Colin Cross Cc: Eric Biederman Cc: Kees Cook Cc: Matthew Wilcox (Oracle) Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: Stephen Rothwell Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 1 + fs/userfaultfd.c | 1 + include/linux/mm_inline.h | 50 +++++++++++++++++++++++++++++++++++++++++++++++ include/linux/mm_types.h | 48 --------------------------------------------- kernel/fork.c | 1 + mm/madvise.c | 1 + mm/mmap.c | 1 + 7 files changed, 55 insertions(+), 48 deletions(-) (limited to 'kernel') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index e6998652fd67..18f8c3acbb85 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include #include +#include #include #include #include diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 5b2af7b82776..e26b10132d47 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index e2ec68b0515c..47d96d2647ca 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -4,6 +4,7 @@ #include #include +#include /** * folio_is_file_lru - Should the folio be on a file LRU or anon LRU? @@ -135,4 +136,53 @@ static __always_inline void del_page_from_lru_list(struct page *page, { lruvec_del_folio(lruvec, page_folio(page)); } + +#ifdef CONFIG_ANON_VMA_NAME +/* + * mmap_lock should be read-locked when calling vma_anon_name() and while using + * the returned pointer. + */ +extern const char *vma_anon_name(struct vm_area_struct *vma); + +/* + * mmap_lock should be read-locked for orig_vma->vm_mm. + * mmap_lock should be write-locked for new_vma->vm_mm or new_vma should be + * isolated. + */ +extern void dup_vma_anon_name(struct vm_area_struct *orig_vma, + struct vm_area_struct *new_vma); + +/* + * mmap_lock should be write-locked or vma should have been isolated under + * write-locked mmap_lock protection. + */ +extern void free_vma_anon_name(struct vm_area_struct *vma); + +/* mmap_lock should be read-locked */ +static inline bool is_same_vma_anon_name(struct vm_area_struct *vma, + const char *name) +{ + const char *vma_name = vma_anon_name(vma); + + /* either both NULL, or pointers to same string */ + if (vma_name == name) + return true; + + return name && vma_name && !strcmp(name, vma_name); +} +#else /* CONFIG_ANON_VMA_NAME */ +static inline const char *vma_anon_name(struct vm_area_struct *vma) +{ + return NULL; +} +static inline void dup_vma_anon_name(struct vm_area_struct *orig_vma, + struct vm_area_struct *new_vma) {} +static inline void free_vma_anon_name(struct vm_area_struct *vma) {} +static inline bool is_same_vma_anon_name(struct vm_area_struct *vma, + const char *name) +{ + return true; +} +#endif /* CONFIG_ANON_VMA_NAME */ + #endif diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 449b6eafc695..4d5fb84eed5e 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -890,52 +890,4 @@ typedef struct { unsigned long val; } swp_entry_t; -#ifdef CONFIG_ANON_VMA_NAME -/* - * mmap_lock should be read-locked when calling vma_anon_name() and while using - * the returned pointer. - */ -extern const char *vma_anon_name(struct vm_area_struct *vma); - -/* - * mmap_lock should be read-locked for orig_vma->vm_mm. - * mmap_lock should be write-locked for new_vma->vm_mm or new_vma should be - * isolated. - */ -extern void dup_vma_anon_name(struct vm_area_struct *orig_vma, - struct vm_area_struct *new_vma); - -/* - * mmap_lock should be write-locked or vma should have been isolated under - * write-locked mmap_lock protection. - */ -extern void free_vma_anon_name(struct vm_area_struct *vma); - -/* mmap_lock should be read-locked */ -static inline bool is_same_vma_anon_name(struct vm_area_struct *vma, - const char *name) -{ - const char *vma_name = vma_anon_name(vma); - - /* either both NULL, or pointers to same string */ - if (vma_name == name) - return true; - - return name && vma_name && !strcmp(name, vma_name); -} -#else /* CONFIG_ANON_VMA_NAME */ -static inline const char *vma_anon_name(struct vm_area_struct *vma) -{ - return NULL; -} -static inline void dup_vma_anon_name(struct vm_area_struct *orig_vma, - struct vm_area_struct *new_vma) {} -static inline void free_vma_anon_name(struct vm_area_struct *vma) {} -static inline bool is_same_vma_anon_name(struct vm_area_struct *vma, - const char *name) -{ - return true; -} -#endif /* CONFIG_ANON_VMA_NAME */ - #endif /* _LINUX_MM_TYPES_H */ diff --git a/kernel/fork.c b/kernel/fork.c index 4cf20b5f2da3..75737e566441 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include diff --git a/mm/madvise.c b/mm/madvise.c index c63aacbbfa78..5604064df464 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include diff --git a/mm/mmap.c b/mm/mmap.c index 85edb0011453..77733b113c40 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3-59-g8ed1b From 39c65a94cd9661532be150e88f8b02f4a6844a35 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 14 Jan 2022 14:07:17 -0800 Subject: mm/pagealloc: sysctl: change watermark_scale_factor max limit to 30% For embedded systems with low total memory, having to run applications with relatively large memory requirements, 10% max limitation for watermark_scale_factor poses an issue of triggering direct reclaim every time such application is started. This results in slow application startup times and bad end-user experience. By increasing watermark_scale_factor max limit we allow vendors more flexibility to choose the right level of kswapd aggressiveness for their device and workload requirements. Link: https://lkml.kernel.org/r/20211124193604.2758863-1-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Lukas Middendorf Cc: Antti Palosaari Cc: Luis Chamberlain Cc: Kees Cook Cc: Iurii Zaikin Cc: Dave Hansen Cc: Vlastimil Babka Cc: Mel Gorman Cc: Jonathan Corbet Cc: Zhang Yi Cc: Fengfei Xi Cc: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/sysctl/vm.rst | 2 +- kernel/sysctl.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index 5e795202111f..f4804ce37c58 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -948,7 +948,7 @@ how much memory needs to be free before kswapd goes back to sleep. The unit is in fractions of 10,000. The default value of 10 means the distances between watermarks are 0.1% of the available memory in the -node/system. The maximum value is 1000, or 10% of memory. +node/system. The maximum value is 3000, or 30% of memory. A high rate of threads entering direct reclaim (allocstall) or kswapd going to sleep prematurely (kswapd_low_wmark_hit_quickly) can indicate diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 083be6af29d7..2ab4edb6e450 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -122,6 +122,7 @@ static unsigned long long_max = LONG_MAX; static int one_hundred = 100; static int two_hundred = 200; static int one_thousand = 1000; +static int three_thousand = 3000; #ifdef CONFIG_PRINTK static int ten_thousand = 10000; #endif @@ -2959,7 +2960,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = watermark_scale_factor_sysctl_handler, .extra1 = SYSCTL_ONE, - .extra2 = &one_thousand, + .extra2 = &three_thousand, }, { .procname = "percpu_pagelist_high_fraction", -- cgit v1.2.3-59-g8ed1b From a674e48c5443d12a8a43c3ac42367aa39505d506 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Fri, 14 Jan 2022 14:07:41 -0800 Subject: dma/pool: create dma atomic pool only if dma zone has managed pages Currently three dma atomic pools are initialized as long as the relevant kernel codes are built in. While in kdump kernel of x86_64, this is not right when trying to create atomic_pool_dma, because there's no managed pages in DMA zone. In the case, DMA zone only has low 1M memory presented and locked down by memblock allocator. So no pages are added into buddy of DMA zone. Please check commit f1d4d47c5851 ("x86/setup: Always reserve the first 1M of RAM"). Then in kdump kernel of x86_64, it always prints below failure message: DMA: preallocated 128 KiB GFP_KERNEL pool for atomic allocations swapper/0: page allocation failure: order:5, mode:0xcc1(GFP_KERNEL|GFP_DMA), nodemask=(null),cpuset=/,mems_allowed=0 CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.13.0-0.rc5.20210611git929d931f2b40.42.fc35.x86_64 #1 Hardware name: Dell Inc. PowerEdge R910/0P658H, BIOS 2.12.0 06/04/2018 Call Trace: dump_stack+0x7f/0xa1 warn_alloc.cold+0x72/0xd6 __alloc_pages_slowpath.constprop.0+0xf29/0xf50 __alloc_pages+0x24d/0x2c0 alloc_page_interleave+0x13/0xb0 atomic_pool_expand+0x118/0x210 __dma_atomic_pool_init+0x45/0x93 dma_atomic_pool_init+0xdb/0x176 do_one_initcall+0x67/0x320 kernel_init_freeable+0x290/0x2dc kernel_init+0xa/0x111 ret_from_fork+0x22/0x30 Mem-Info: ...... DMA: failed to allocate 128 KiB GFP_KERNEL|GFP_DMA pool for atomic allocation DMA: preallocated 128 KiB GFP_KERNEL|GFP_DMA32 pool for atomic allocations Here, let's check if DMA zone has managed pages, then create atomic_pool_dma if yes. Otherwise just skip it. Link: https://lkml.kernel.org/r/20211223094435.248523-3-bhe@redhat.com Fixes: 6f599d84231f ("x86/kdump: Always reserve the low 1M when the crashkernel option is specified") Signed-off-by: Baoquan He Reviewed-by: Christoph Hellwig Acked-by: John Donnelly Reviewed-by: David Hildenbrand Cc: Marek Szyprowski Cc: Robin Murphy Cc: Borislav Petkov Cc: Christoph Lameter Cc: David Laight Cc: David Rientjes Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Joonsoo Kim Cc: Pekka Enberg Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/dma/pool.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index 5f84e6cdb78e..4d40dcce7604 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -203,7 +203,7 @@ static int __init dma_atomic_pool_init(void) GFP_KERNEL); if (!atomic_pool_kernel) ret = -ENOMEM; - if (IS_ENABLED(CONFIG_ZONE_DMA)) { + if (has_managed_dma()) { atomic_pool_dma = __dma_atomic_pool_init(atomic_pool_size, GFP_KERNEL | GFP_DMA); if (!atomic_pool_dma) @@ -226,7 +226,7 @@ static inline struct gen_pool *dma_guess_pool(struct gen_pool *prev, gfp_t gfp) if (prev == NULL) { if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32)) return atomic_pool_dma32; - if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp & GFP_DMA)) + if (atomic_pool_dma && (gfp & GFP_DMA)) return atomic_pool_dma; return atomic_pool_kernel; } -- cgit v1.2.3-59-g8ed1b From 21b084fdf2a49ca1634e8e360e9ab6f9ff0dee11 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 14 Jan 2022 14:08:21 -0800 Subject: mm/mempolicy: wire up syscall set_mempolicy_home_node Link: https://lkml.kernel.org/r/20211202123810.267175-4-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Ben Widawsky Cc: Dave Hansen Cc: Feng Tang Cc: Michal Hocko Cc: Andrea Arcangeli Cc: Mel Gorman Cc: Mike Kravetz Cc: Randy Dunlap Cc: Vlastimil Babka Cc: Andi Kleen Cc: Dan Williams Cc: Huang Ying Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/kernel/syscalls/syscall.tbl | 1 + arch/arm/tools/syscall.tbl | 1 + arch/arm64/include/asm/unistd.h | 2 +- arch/arm64/include/asm/unistd32.h | 2 ++ arch/ia64/kernel/syscalls/syscall.tbl | 1 + arch/m68k/kernel/syscalls/syscall.tbl | 1 + arch/microblaze/kernel/syscalls/syscall.tbl | 1 + arch/mips/kernel/syscalls/syscall_n32.tbl | 1 + arch/mips/kernel/syscalls/syscall_n64.tbl | 1 + arch/mips/kernel/syscalls/syscall_o32.tbl | 1 + arch/parisc/kernel/syscalls/syscall.tbl | 1 + arch/powerpc/kernel/syscalls/syscall.tbl | 1 + arch/s390/kernel/syscalls/syscall.tbl | 1 + arch/sh/kernel/syscalls/syscall.tbl | 1 + arch/sparc/kernel/syscalls/syscall.tbl | 1 + arch/x86/entry/syscalls/syscall_32.tbl | 1 + arch/x86/entry/syscalls/syscall_64.tbl | 1 + arch/xtensa/kernel/syscalls/syscall.tbl | 1 + include/linux/syscalls.h | 3 +++ include/uapi/asm-generic/unistd.h | 5 ++++- kernel/sys_ni.c | 1 + 21 files changed, 27 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl index ca5a32228cd6..3515bc4f16a4 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -489,3 +489,4 @@ # 557 reserved for memfd_secret 558 common process_mrelease sys_process_mrelease 559 common futex_waitv sys_futex_waitv +560 common set_mempolicy_home_node sys_ni_syscall diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index 543100151f2b..ac964612d8b0 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -463,3 +463,4 @@ # 447 reserved for memfd_secret 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv +450 common set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index 6bdb5f5db438..4e65da3445c7 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -38,7 +38,7 @@ #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5) #define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800) -#define __NR_compat_syscalls 450 +#define __NR_compat_syscalls 451 #endif #define __ARCH_WANT_SYS_CLONE diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index 41ea1195e44b..604a2053d006 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -905,6 +905,8 @@ __SYSCALL(__NR_landlock_restrict_self, sys_landlock_restrict_self) __SYSCALL(__NR_process_mrelease, sys_process_mrelease) #define __NR_futex_waitv 449 __SYSCALL(__NR_futex_waitv, sys_futex_waitv) +#define __NR_set_mempolicy_home_node 450 +__SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node) /* * Please add new compat syscalls above this comment and update diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl index 707ae121f6d3..78b1d03e86e1 100644 --- a/arch/ia64/kernel/syscalls/syscall.tbl +++ b/arch/ia64/kernel/syscalls/syscall.tbl @@ -370,3 +370,4 @@ # 447 reserved for memfd_secret 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv +450 common set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl index 45bc32a41b90..b1f3940bc298 100644 --- a/arch/m68k/kernel/syscalls/syscall.tbl +++ b/arch/m68k/kernel/syscalls/syscall.tbl @@ -449,3 +449,4 @@ # 447 reserved for memfd_secret 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv +450 common set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl index 2204bde3ce4a..820145e47350 100644 --- a/arch/microblaze/kernel/syscalls/syscall.tbl +++ b/arch/microblaze/kernel/syscalls/syscall.tbl @@ -455,3 +455,4 @@ # 447 reserved for memfd_secret 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv +450 common set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index 72d02d363f36..253ff994ed2e 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -388,3 +388,4 @@ # 447 reserved for memfd_secret 448 n32 process_mrelease sys_process_mrelease 449 n32 futex_waitv sys_futex_waitv +450 n32 set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl index e2c481fcede6..3f1886ad9d80 100644 --- a/arch/mips/kernel/syscalls/syscall_n64.tbl +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl @@ -364,3 +364,4 @@ # 447 reserved for memfd_secret 448 n64 process_mrelease sys_process_mrelease 449 n64 futex_waitv sys_futex_waitv +450 common set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index 3714c97b2643..8f243e35a7b2 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -437,3 +437,4 @@ # 447 reserved for memfd_secret 448 o32 process_mrelease sys_process_mrelease 449 o32 futex_waitv sys_futex_waitv +450 o32 set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index 358c00000755..68b46fe2f17c 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -447,3 +447,4 @@ # 447 reserved for memfd_secret 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv +450 common set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index 15109af9d075..2600b4237292 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -529,3 +529,4 @@ # 447 reserved for memfd_secret 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv +450 nospu set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index ed9c5c2eafad..799147658dee 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -452,3 +452,4 @@ # 447 reserved for memfd_secret 448 common process_mrelease sys_process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv sys_futex_waitv +450 common set_mempolicy_home_node sys_set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl index d9539d28bdaa..2de85c977f54 100644 --- a/arch/sh/kernel/syscalls/syscall.tbl +++ b/arch/sh/kernel/syscalls/syscall.tbl @@ -452,3 +452,4 @@ # 447 reserved for memfd_secret 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv +450 common set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index 46adabcb1720..4398cc6fb68d 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -495,3 +495,4 @@ # 447 reserved for memfd_secret 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv +450 common set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 7e25543693de..320480a8db4f 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -454,3 +454,4 @@ 447 i386 memfd_secret sys_memfd_secret 448 i386 process_mrelease sys_process_mrelease 449 i386 futex_waitv sys_futex_waitv +450 i386 set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index fe8f8dd157b4..c84d12608cd2 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -371,6 +371,7 @@ 447 common memfd_secret sys_memfd_secret 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv +450 common set_mempolicy_home_node sys_set_mempolicy_home_node # # Due to a historical design error, certain syscalls are numbered differently diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl index 3e3e1a506bed..52c94ab5c205 100644 --- a/arch/xtensa/kernel/syscalls/syscall.tbl +++ b/arch/xtensa/kernel/syscalls/syscall.tbl @@ -420,3 +420,4 @@ # 447 reserved for memfd_secret 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv +450 common set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 528a478dbda8..819c0cb00b6d 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -1057,6 +1057,9 @@ asmlinkage long sys_landlock_add_rule(int ruleset_fd, enum landlock_rule_type ru const void __user *rule_attr, __u32 flags); asmlinkage long sys_landlock_restrict_self(int ruleset_fd, __u32 flags); asmlinkage long sys_memfd_secret(unsigned int flags); +asmlinkage long sys_set_mempolicy_home_node(unsigned long start, unsigned long len, + unsigned long home_node, + unsigned long flags); /* * Architecture-specific system calls diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 4557a8b6086f..1c48b0ae3ba3 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -883,8 +883,11 @@ __SYSCALL(__NR_process_mrelease, sys_process_mrelease) #define __NR_futex_waitv 449 __SYSCALL(__NR_futex_waitv, sys_futex_waitv) +#define __NR_set_mempolicy_home_node 450 +__SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node) + #undef __NR_syscalls -#define __NR_syscalls 450 +#define __NR_syscalls 451 /* * 32 bit systems traditionally used different diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index d1944258cfc0..a492f159624f 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -297,6 +297,7 @@ COND_SYSCALL(get_mempolicy); COND_SYSCALL(set_mempolicy); COND_SYSCALL(migrate_pages); COND_SYSCALL(move_pages); +COND_SYSCALL(set_mempolicy_home_node); COND_SYSCALL(perf_event_open); COND_SYSCALL(accept4); -- cgit v1.2.3-59-g8ed1b