From 3ca55ca225d7627e76fcfe2894740c4f615ff2e0 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 2 Apr 2025 11:09:25 -0700 Subject: exit: move and extend sched_process_exit() tracepoint It is useful to be able to access current->mm at task exit to, say, record a bunch of VMA information right before the task exits (e.g., for stack symbolization reasons when dealing with short-lived processes that exit in the middle of profiling session). Currently, trace_sched_process_exit() is triggered after exit_mm() which resets current->mm to NULL making this tracepoint unsuitable for inspecting and recording task's mm_struct-related data when tracing process lifetimes. There is a particularly suitable place, though, right after taskstats_exit() is called, but before we do exit_mm() and other exit_*() resource teardowns. taskstats performs a similar kind of accounting that some applications do with BPF, and so co-locating them seems like a good fit. So that's where trace_sched_process_exit() is moved with this patch. Also, existing trace_sched_process_exit() tracepoint is notoriously missing `group_dead` flag that is certainly useful in practice and some of our production applications have to work around this. So plumb `group_dead` through while at it, to have a richer and more complete tracepoint. Note that we can't use sched_process_template anymore, and so we use TRACE_EVENT()-based tracepoint definition. But all the field names and order, as well as assign and output logic remain intact. We just add one extra field at the end in backwards-compatible way. [andrii@kernel.org: document sched_process_exit and sched_process_template relation] Link: https://lkml.kernel.org/r/20250403174120.4087794-1-andrii@kernel.org Link: https://lkml.kernel.org/r/20250402180925.90914-1-andrii@kernel.org Signed-off-by: Andrii Nakryiko Acked-by: Steven Rostedt (Google) Acked-by: Oleg Nesterov Suggested-by: Ingo Molnar Cc: Alexander Potapenko Cc: Christian Brauner Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Signed-off-by: Andrew Morton --- include/trace/events/sched.h | 34 ++++++++++++++++++++++++++++++---- kernel/exit.c | 2 +- 2 files changed, 31 insertions(+), 5 deletions(-) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 8994e97d86c1..3bec9fb73a36 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -326,11 +326,37 @@ DEFINE_EVENT(sched_process_template, sched_process_free, TP_ARGS(p)); /* - * Tracepoint for a task exiting: + * Tracepoint for a task exiting. + * Note, it's a superset of sched_process_template and should be kept + * compatible as much as possible. sched_process_exits has an extra + * `group_dead` argument, so sched_process_template can't be used, + * unfortunately, just like sched_migrate_task above. */ -DEFINE_EVENT(sched_process_template, sched_process_exit, - TP_PROTO(struct task_struct *p), - TP_ARGS(p)); +TRACE_EVENT(sched_process_exit, + + TP_PROTO(struct task_struct *p, bool group_dead), + + TP_ARGS(p, group_dead), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, prio ) + __field( bool, group_dead ) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->prio = p->prio; /* XXX SCHED_DEADLINE */ + __entry->group_dead = group_dead; + ), + + TP_printk("comm=%s pid=%d prio=%d group_dead=%s", + __entry->comm, __entry->pid, __entry->prio, + __entry->group_dead ? "true" : "false" + ) +); /* * Tracepoint for waiting on task to unschedule: diff --git a/kernel/exit.c b/kernel/exit.c index 1b51dc099f1e..f1db86dcbeb1 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -936,12 +936,12 @@ void __noreturn do_exit(long code) tsk->exit_code = code; taskstats_exit(tsk, group_dead); + trace_sched_process_exit(tsk, group_dead); exit_mm(); if (group_dead) acct_process(); - trace_sched_process_exit(tsk); exit_sem(tsk); exit_shm(tsk); -- cgit v1.2.3-59-g8ed1b From 3330dc1b2074629c74008ea0ee0b4fb279ce8992 Mon Sep 17 00:00:00 2001 From: Francesco Valla Date: Sun, 16 Mar 2025 21:50:15 +0100 Subject: init/main.c: log initcall level when initcall_debug is used When initcall_debug is specified on the command line, the start and return point for each initcall is printed. However, no information on the initcall level is reported. Add to the initcall_debug infrastructure an additional print that informs when a new initcall level is entered. This is particularly useful when debugging dependency chains and/or working on boot time reduction. Link: https://lkml.kernel.org/r/20250316205014.2830071-2-francesco@valla.it Signed-off-by: Francesco Valla Cc: Steven Rostedt Cc: Tim Bird Signed-off-by: Andrew Morton --- init/main.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/init/main.c b/init/main.c index 7f0a2a3dbd29..6b14e6116a1f 100644 --- a/init/main.c +++ b/init/main.c @@ -1214,6 +1214,12 @@ trace_initcall_finish_cb(void *data, initcall_t fn, int ret) fn, ret, (unsigned long long)ktime_us_delta(rettime, *calltime)); } +static __init_or_module void +trace_initcall_level_cb(void *data, const char *level) +{ + printk(KERN_DEBUG "entering initcall level: %s\n", level); +} + static ktime_t initcall_calltime; #ifdef TRACEPOINTS_ENABLED @@ -1225,10 +1231,12 @@ static void __init initcall_debug_enable(void) &initcall_calltime); ret |= register_trace_initcall_finish(trace_initcall_finish_cb, &initcall_calltime); + ret |= register_trace_initcall_level(trace_initcall_level_cb, NULL); WARN(ret, "Failed to register initcall tracepoints\n"); } # define do_trace_initcall_start trace_initcall_start # define do_trace_initcall_finish trace_initcall_finish +# define do_trace_initcall_level trace_initcall_level #else static inline void do_trace_initcall_start(initcall_t fn) { @@ -1242,6 +1250,12 @@ static inline void do_trace_initcall_finish(initcall_t fn, int ret) return; trace_initcall_finish_cb(&initcall_calltime, fn, ret); } +static inline void do_trace_initcall_level(const char *level) +{ + if (!initcall_debug) + return; + trace_initcall_level_cb(NULL, level); +} #endif /* !TRACEPOINTS_ENABLED */ int __init_or_module do_one_initcall(initcall_t fn) @@ -1314,7 +1328,7 @@ static void __init do_initcall_level(int level, char *command_line) level, level, NULL, ignore_unknown_bootoption); - trace_initcall_level(initcall_level_names[level]); + do_trace_initcall_level(initcall_level_names[level]); for (fn = initcall_levels[level]; fn < initcall_levels[level+1]; fn++) do_one_initcall(initcall_from_entry(fn)); } @@ -1358,7 +1372,7 @@ static void __init do_pre_smp_initcalls(void) { initcall_entry_t *fn; - trace_initcall_level("early"); + do_trace_initcall_level("early"); for (fn = __initcall_start; fn < __initcall0_start; fn++) do_one_initcall(initcall_from_entry(fn)); } -- cgit v1.2.3-59-g8ed1b From 247021624a99a35e0b04cfd784a84bd65b5a0c59 Mon Sep 17 00:00:00 2001 From: Zhiquan Li Date: Thu, 3 Apr 2025 11:08:01 +0800 Subject: crash: export PAGE_UNACCEPTED_MAPCOUNT_VALUE to vmcoreinfo On Intel TDX guest, unaccepted memory is unusable free memory which is not managed by buddy, until it's accepted by guest. Before that, it cannot be accessed by the first kernel as well as the kexec'ed kernel. The kexec'ed kernel will skip these pages and fill in zero data for the reader of vmcore. The dump tool like makedumpfile creates a page descriptor (size 24 bytes) for each non-free page, including zero data page, but it will not create descriptor for free pages. If it is not able to distinguish these unaccepted pages with zero data pages, a certain amount of space will be wasted in proportion (~1/170). In fact, as a special kind of free page the unaccepted pages should be excluded, like the real free pages. Export the page type PAGE_UNACCEPTED_MAPCOUNT_VALUE to vmcoreinfo, so that dump tool can identify whether a page is unaccepted. [zhiquan1.li@intel.com: fix docs: "Title underline too short" warning] Link: https://lore.kernel.org/all/20240809114854.3745464-5-kirill.shutemov@linux.intel.com/ Link: https://lkml.kernel.org/r/20250405060610.860465-1-zhiquan1.li@intel.com Link: https://lore.kernel.org/all/20240809114854.3745464-5-kirill.shutemov@linux.intel.com/ Link: https://lkml.kernel.org/r/20250403030801.758687-1-zhiquan1.li@intel.com Signed-off-by: Zhiquan Li Reviewed-by: Kirill A. Shutemov Acked-by: Baoquan He Cc: Dave Young Cc: Vivek Goyal Cc: Zhiquan Li Signed-off-by: Andrew Morton --- Documentation/admin-guide/kdump/vmcoreinfo.rst | 4 ++-- kernel/vmcore_info.c | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/kdump/vmcoreinfo.rst b/Documentation/admin-guide/kdump/vmcoreinfo.rst index 0f714fc945ac..8cf4614385b7 100644 --- a/Documentation/admin-guide/kdump/vmcoreinfo.rst +++ b/Documentation/admin-guide/kdump/vmcoreinfo.rst @@ -331,8 +331,8 @@ PG_lru|PG_private|PG_swapcache|PG_swapbacked|PG_slab|PG_hwpoision|PG_head_mask|P Page attributes. These flags are used to filter various unnecessary for dumping pages. -PAGE_BUDDY_MAPCOUNT_VALUE(~PG_buddy)|PAGE_OFFLINE_MAPCOUNT_VALUE(~PG_offline) ------------------------------------------------------------------------------ +PAGE_BUDDY_MAPCOUNT_VALUE(~PG_buddy)|PAGE_OFFLINE_MAPCOUNT_VALUE(~PG_offline)|PAGE_OFFLINE_MAPCOUNT_VALUE(~PG_unaccepted) +------------------------------------------------------------------------------------------------------------------------- More page attributes. These flags are used to filter various unnecessary for dumping pages. diff --git a/kernel/vmcore_info.c b/kernel/vmcore_info.c index 1fec61603ef3..e066d31d08f8 100644 --- a/kernel/vmcore_info.c +++ b/kernel/vmcore_info.c @@ -210,6 +210,10 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_NUMBER(PAGE_HUGETLB_MAPCOUNT_VALUE); #define PAGE_OFFLINE_MAPCOUNT_VALUE (PGTY_offline << 24) VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE); +#ifdef CONFIG_UNACCEPTED_MEMORY +#define PAGE_UNACCEPTED_MAPCOUNT_VALUE (PGTY_unaccepted << 24) + VMCOREINFO_NUMBER(PAGE_UNACCEPTED_MAPCOUNT_VALUE); +#endif #ifdef CONFIG_KALLSYMS VMCOREINFO_SYMBOL(kallsyms_names); -- cgit v1.2.3-59-g8ed1b From db80bd2cea1b7c2574578461c9de4c3d9ee7634a Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Thu, 3 Apr 2025 14:33:03 +0200 Subject: task_stack.h: remove obsolete __HAVE_ARCH_KSTACK_END check Remove __HAVE_ARCH_KSTACK_END as it has been obsolete since removal of metag architecture in v4.17. Link: https://lkml.kernel.org/r/20250403-kstack-end-v1-1-7798e71f34d1@linaro.org Signed-off-by: Pasha Tatashin Link: https://lore.kernel.org/20240311164638.2015063-2-pasha.tatashin@soleen.com Signed-off-by: Linus Walleij Signed-off-by: Andrew Morton --- include/linux/sched/task_stack.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/linux/sched/task_stack.h b/include/linux/sched/task_stack.h index cffad65bdc6a..85c5a6392e02 100644 --- a/include/linux/sched/task_stack.h +++ b/include/linux/sched/task_stack.h @@ -106,7 +106,6 @@ static inline unsigned long stack_not_used(struct task_struct *p) #endif extern void set_task_stack_end_magic(struct task_struct *tsk); -#ifndef __HAVE_ARCH_KSTACK_END static inline int kstack_end(void *addr) { /* Reliable end of stack detection: @@ -114,6 +113,5 @@ static inline int kstack_end(void *addr) */ return !(((unsigned long)addr+sizeof(void*)-1) & (THREAD_SIZE-sizeof(void*))); } -#endif #endif /* _LINUX_SCHED_TASK_STACK_H */ -- cgit v1.2.3-59-g8ed1b From 3dfd79cc8772bc2f02e060aa8c0bbbba8c1a1e45 Mon Sep 17 00:00:00 2001 From: Chisheng Chen Date: Thu, 3 Apr 2025 19:26:14 +0800 Subject: lib/rbtree.c: fix the example typo Replace `sr` with `Sr`. The condition `!tmp1 || rb_is_black(tmp1)` ensures that `tmp1` (which is `sibling->rb_right`) is either NULL or a black node. Therefore, the right child of the sibling must be black, and the example should use `Sr` instead of `sr`. Link: https://lkml.kernel.org/r/20250403112614.570140-1-johnny1001s000602@gmail.com Signed-off-by: Chisheng Chen Cc: Hsin Chang Yu Signed-off-by: Andrew Morton --- lib/rbtree.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/rbtree.c b/lib/rbtree.c index 989c2d615f92..5114eda6309c 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -297,9 +297,9 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root, * / \ / \ * N S --> N sl * / \ \ - * sl sr S + * sl Sr S * \ - * sr + * Sr * * Note: p might be red, and then both * p and sl are red after rotation(which @@ -312,9 +312,9 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root, * / \ / \ * N sl --> P S * \ / \ - * S N sr + * S N Sr * \ - * sr + * Sr */ tmp1 = tmp2->rb_right; WRITE_ONCE(sibling->rb_left, tmp1); -- cgit v1.2.3-59-g8ed1b From 65c66047259fad1b868d4454bc5af95b46a5f954 Mon Sep 17 00:00:00 2001 From: Penglei Jiang Date: Thu, 3 Apr 2025 23:33:57 -0700 Subject: proc: fix the issue of proc_mem_open returning NULL proc_mem_open() can return an errno, NULL, or mm_struct*. If it fails to acquire mm, it returns NULL, but the caller does not check for the case when the return value is NULL. The following conditions lead to failure in acquiring mm: - The task is a kernel thread (PF_KTHREAD) - The task is exiting (PF_EXITING) Changes: - Add documentation comments for the return value of proc_mem_open(). - Add checks in the caller to return -ESRCH when proc_mem_open() returns NULL. Link: https://lkml.kernel.org/r/20250404063357.78891-1-superman.xpt@gmail.com Reported-by: syzbot+f9238a0a31f9b5603fef@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/000000000000f52642060d4e3750@google.com Signed-off-by: Penglei Jiang Cc: Al Viro Cc: Adrian Ratiu Cc: Christian Brauner Cc: Felix Moessbauer Cc: Jeff layton Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Thomas Gleinxer Cc: xu xin Cc: Alexey Dobriyan Signed-off-by: Andrew Morton --- fs/proc/base.c | 12 +++++++++--- fs/proc/task_mmu.c | 12 ++++++------ fs/proc/task_nommu.c | 4 ++-- 3 files changed, 17 insertions(+), 11 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index b0d4e1908b22..85a3f5e253d4 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -827,7 +827,13 @@ static const struct file_operations proc_single_file_operations = { .release = single_release, }; - +/* + * proc_mem_open() can return errno, NULL or mm_struct*. + * + * - Returns NULL if the task has no mm (PF_KTHREAD or PF_EXITING) + * - Returns mm_struct* on success + * - Returns error code on failure + */ struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode) { struct task_struct *task = get_proc_task(inode); @@ -854,8 +860,8 @@ static int __mem_open(struct inode *inode, struct file *file, unsigned int mode) { struct mm_struct *mm = proc_mem_open(inode, mode); - if (IS_ERR(mm)) - return PTR_ERR(mm); + if (IS_ERR_OR_NULL(mm)) + return mm ? PTR_ERR(mm) : -ESRCH; file->private_data = mm; return 0; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 994cde10e3f4..b9e3bd006346 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -212,8 +212,8 @@ static int proc_maps_open(struct inode *inode, struct file *file, priv->inode = inode; priv->mm = proc_mem_open(inode, PTRACE_MODE_READ); - if (IS_ERR(priv->mm)) { - int err = PTR_ERR(priv->mm); + if (IS_ERR_OR_NULL(priv->mm)) { + int err = priv->mm ? PTR_ERR(priv->mm) : -ESRCH; seq_release_private(inode, file); return err; @@ -1325,8 +1325,8 @@ static int smaps_rollup_open(struct inode *inode, struct file *file) priv->inode = inode; priv->mm = proc_mem_open(inode, PTRACE_MODE_READ); - if (IS_ERR(priv->mm)) { - ret = PTR_ERR(priv->mm); + if (IS_ERR_OR_NULL(priv->mm)) { + ret = priv->mm ? PTR_ERR(priv->mm) : -ESRCH; single_release(inode, file); goto out_free; @@ -2069,8 +2069,8 @@ static int pagemap_open(struct inode *inode, struct file *file) struct mm_struct *mm; mm = proc_mem_open(inode, PTRACE_MODE_READ); - if (IS_ERR(mm)) - return PTR_ERR(mm); + if (IS_ERR_OR_NULL(mm)) + return mm ? PTR_ERR(mm) : -ESRCH; file->private_data = mm; return 0; } diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index bce674533000..59bfd61d653a 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -260,8 +260,8 @@ static int maps_open(struct inode *inode, struct file *file, priv->inode = inode; priv->mm = proc_mem_open(inode, PTRACE_MODE_READ); - if (IS_ERR(priv->mm)) { - int err = PTR_ERR(priv->mm); + if (IS_ERR_OR_NULL(priv->mm)) { + int err = priv->mm ? PTR_ERR(priv->mm) : -ESRCH; seq_release_private(inode, file); return err; -- cgit v1.2.3-59-g8ed1b From df3d527495376e29333035e087ead194580ea3b2 Mon Sep 17 00:00:00 2001 From: Jim Cromie Date: Tue, 25 Mar 2025 17:51:54 -0600 Subject: checkpatch: dont warn about unused macro arg on empty body Patch series "2 checkpatch fixes, one pr_info_once". 2 small tweaks to checkpatch, 1 reducing several pages of powernow "not-relevant-here" log-msgs to a few lines This patch (of 3): We currently get: WARNING: Argument 'name' is not used in function-like macro on: #define DRM_CLASSMAP_USE(name) /* nothing here */ Following this advice is wrong here, and shouldn't be fixed by ignoring args altogether; the macro should properly fail if invoked with 0 or 2+ args. Link: https://lkml.kernel.org/r/20250325235156.663269-1-jim.cromie@gmail.com Link: https://lkml.kernel.org/r/20250325235156.663269-2-jim.cromie@gmail.com Signed-off-by: Jim Cromie Acked-by: Joe Perches Reviewed-by: Louis Chauvet Cc: Andy Whitcroft Cc: Joe Perches Cc: Dwaipayan Ray Cc: Lukas Bulwahn Cc:"Rafael J. Wysocki" Cc: Viresh Kumar Signed-off-by: Andrew Morton --- scripts/checkpatch.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 3d22bf863eec..75349f766b89 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -6024,7 +6024,7 @@ sub process { } # check if this is an unused argument - if ($define_stmt !~ /\b$arg\b/) { + if ($define_stmt !~ /\b$arg\b/ && $define_stmt) { WARN("MACRO_ARG_UNUSED", "Argument '$arg' is not used in function-like macro\n" . "$herectx"); } -- cgit v1.2.3-59-g8ed1b From 15d4734c7a5837bd1a3d261ae232fa698fef39c4 Mon Sep 17 00:00:00 2001 From: Jim Cromie Date: Tue, 25 Mar 2025 17:51:55 -0600 Subject: checkpatch: qualify do-while-0 advice Add a paragraph of advice qualifying the general do-while-0 advice, noting 3 possible misguidings. reduce one ERROR to WARN, for the case I actually encountered. And add 'static_assert' to named exceptions, along with some additional comments about named exceptions vs (detection of) declarative construction primitives (union, struct, [], etc). Link: https://lkml.kernel.org/r/20250325235156.663269-3-jim.cromie@gmail.com Signed-off-by: Jim Cromie Cc: Andy Whitcroft Cc: Joe Perches Cc: Dwaipayan Ray Cc: Lukas Bulwahn Cc: Louis Chauvet Cc: Viresh Kumar Signed-off-by: Andrew Morton --- scripts/checkpatch.pl | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 75349f766b89..e8afe3f765de 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -151,6 +151,24 @@ EOM exit($exitcode); } +my $DO_WHILE_0_ADVICE = q{ + do {} while (0) advice is over-stated in a few situations: + + The more obvious case is macros, like MODULE_PARM_DESC, invoked at + file-scope, where C disallows code (it must be in functions). See + $exceptions if you have one to add by name. + + More troublesome is declarative macros used at top of new scope, + like DECLARE_PER_CPU. These might just compile with a do-while-0 + wrapper, but would be incorrect. Most of these are handled by + detecting struct,union,etc declaration primitives in $exceptions. + + Theres also macros called inside an if (block), which "return" an + expression. These cannot do-while, and need a ({}) wrapper. + + Enjoy this qualification while we work to improve our heuristics. +}; + sub uniq { my %seen; return grep { !$seen{$_}++ } @_; @@ -5883,9 +5901,9 @@ sub process { } } -# multi-statement macros should be enclosed in a do while loop, grab the -# first statement and ensure its the whole macro if its not enclosed -# in a known good container +# Usually multi-statement macros should be enclosed in a do {} while +# (0) loop. Grab the first statement and ensure its the whole macro +# if its not enclosed in a known good container if ($realfile !~ m@/vmlinux.lds.h$@ && $line =~ /^.\s*\#\s*define\s*$Ident(\()?/) { my $ln = $linenr; @@ -5938,10 +5956,13 @@ sub process { my $exceptions = qr{ $Declare| + # named exceptions module_param_named| MODULE_PARM_DESC| DECLARE_PER_CPU| DEFINE_PER_CPU| + static_assert| + # declaration primitives __typeof__\(| union| struct| @@ -5976,11 +5997,11 @@ sub process { ERROR("MULTISTATEMENT_MACRO_USE_DO_WHILE", "Macros starting with if should be enclosed by a do - while loop to avoid possible if/else logic defects\n" . "$herectx"); } elsif ($dstat =~ /;/) { - ERROR("MULTISTATEMENT_MACRO_USE_DO_WHILE", - "Macros with multiple statements should be enclosed in a do - while loop\n" . "$herectx"); + WARN("MULTISTATEMENT_MACRO_USE_DO_WHILE", + "Non-declarative macros with multiple statements should be enclosed in a do - while loop\n" . "$herectx\nBUT SEE:\n$DO_WHILE_0_ADVICE"); } else { ERROR("COMPLEX_MACRO", - "Macros with complex values should be enclosed in parentheses\n" . "$herectx"); + "Macros with complex values should be enclosed in parentheses\n" . "$herectx\nBUT SEE:\n$DO_WHILE_0_ADVICE"); } } -- cgit v1.2.3-59-g8ed1b From 91e53493eeaf0fd74de7d57a162420a067cb84b9 Mon Sep 17 00:00:00 2001 From: Jim Cromie Date: Tue, 25 Mar 2025 17:51:56 -0600 Subject: powernow: use pr_info_once This reduces log-msgs during boot from many pages to ~10 occurrences. I didn't investigate why it wasn't just 1, maybe its a low-level service to other modules, re-probed by each of them ? Link: https://lkml.kernel.org/r/20250325235156.663269-4-jim.cromie@gmail.com Signed-off-by: Jim Cromie Cc: Andy Whitcroft Cc: Dwaipayan Ray Cc: Joe Perches Cc: Louis Chauvet Cc: Lukas Bulwahn Cc: Viresh Kumar Signed-off-by: Andrew Morton --- drivers/cpufreq/powernow-k8.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpufreq/powernow-k8.c b/drivers/cpufreq/powernow-k8.c index 4e3ba6e68c32..f7512b4e923e 100644 --- a/drivers/cpufreq/powernow-k8.c +++ b/drivers/cpufreq/powernow-k8.c @@ -482,7 +482,7 @@ static void check_supported_cpu(void *_rc) cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx); if ((edx & P_STATE_TRANSITION_CAPABLE) != P_STATE_TRANSITION_CAPABLE) { - pr_info("Power state transitions not supported\n"); + pr_info_once("Power state transitions not supported\n"); return; } *rc = 0; -- cgit v1.2.3-59-g8ed1b From 4ef5211ee68113070bd42142f06347866675055e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 24 Mar 2025 12:50:24 +0200 Subject: kernel.h: move READ/WRITE definitions to Patch series "kernel.h: Move out a couple of macros and constants". kernel.h hosts a couple of macros and constants that may be better placed. Do that. Also add missing documentation. No functional changes intended. This patch (of 2): Headers shouldn't be forced to include just to gain these simple constants. Link: https://lkml.kernel.org/r/20250324105228.775784-1-andriy.shevchenko@linux.intel.com Link: https://lkml.kernel.org/r/20250324105228.775784-2-andriy.shevchenko@linux.intel.com Signed-off-by: Ingo Molnar Signed-off-by: Andy Shevchenko Cc: Alexandru Ardelean Signed-off-by: Andrew Morton --- include/linux/kernel.h | 4 ---- include/linux/types.h | 4 ++++ 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index be2e8c0a187e..01bb0fac3667 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -41,10 +41,6 @@ #define STACK_MAGIC 0xdeadbeef -/* generic data direction definitions */ -#define READ 0 -#define WRITE 1 - #define PTR_IF(cond, ptr) ((cond) ? (ptr) : NULL) #define u64_to_user_ptr(x) ( \ diff --git a/include/linux/types.h b/include/linux/types.h index 49b79c8bb1a9..6dfdb8e8e4c3 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -136,6 +136,10 @@ typedef s64 ktime_t; typedef u64 sector_t; typedef u64 blkcnt_t; +/* generic data direction definitions */ +#define READ 0 +#define WRITE 1 + /* * The type of an index into the pagecache. */ -- cgit v1.2.3-59-g8ed1b From 029c896c4105b7d74fcd88d99c5fbab2558fee89 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 24 Mar 2025 12:50:25 +0200 Subject: kernel.h: move PTR_IF() and u64_to_user_ptr() to util_macros.h While the natural choice of PTR_IF() is kconfig.h, the latter is too broad to include C code and actually the macro was moved out from there in the past. But kernel.h is neither a good choice for that. Move it to util_macros.h. Do the same for u64_to_user_ptr(). While moving, add necessary documentation. Link: https://lkml.kernel.org/r/20250324105228.775784-3-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Cc: Alexandru Ardelean Cc: Ingo Molnar Signed-off-by: Andrew Morton --- include/linux/kernel.h | 10 +------ include/linux/util_macros.h | 66 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 9 deletions(-) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 01bb0fac3667..1cce1f6410a9 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -41,15 +42,6 @@ #define STACK_MAGIC 0xdeadbeef -#define PTR_IF(cond, ptr) ((cond) ? (ptr) : NULL) - -#define u64_to_user_ptr(x) ( \ -{ \ - typecheck(u64, (x)); \ - (void __user *)(uintptr_t)(x); \ -} \ -) - struct completion; struct user; diff --git a/include/linux/util_macros.h b/include/linux/util_macros.h index 3b570b765b75..7b64fb597f85 100644 --- a/include/linux/util_macros.h +++ b/include/linux/util_macros.h @@ -79,6 +79,72 @@ (__fc_i); \ }) +/** + * PTR_IF - evaluate to @ptr if @cond is true, or to NULL otherwise. + * @cond: A conditional, usually in a form of IS_ENABLED(CONFIG_FOO) + * @ptr: A pointer to assign if @cond is true. + * + * PTR_IF(IS_ENABLED(CONFIG_FOO), ptr) evaluates to @ptr if CONFIG_FOO is set + * to 'y' or 'm', or to NULL otherwise. The (ptr) argument must be a pointer. + * + * The macro can be very useful to help compiler dropping dead code. + * + * For instance, consider the following:: + * + * #ifdef CONFIG_FOO_SUSPEND + * static int foo_suspend(struct device *dev) + * { + * ... + * } + * #endif + * + * static struct pm_ops foo_ops = { + * #ifdef CONFIG_FOO_SUSPEND + * .suspend = foo_suspend, + * #endif + * }; + * + * While this works, the foo_suspend() macro is compiled conditionally, + * only when CONFIG_FOO_SUSPEND is set. This is problematic, as there could + * be a build bug in this function, we wouldn't have a way to know unless + * the configuration option is set. + * + * An alternative is to declare foo_suspend() always, but mark it + * as __maybe_unused. This works, but the __maybe_unused attribute + * is required to instruct the compiler that the function may not + * be referenced anywhere, and is safe to remove without making + * a fuss about it. This makes the programmer responsible for tagging + * the functions that can be garbage-collected. + * + * With the macro it is possible to write the following: + * + * static int foo_suspend(struct device *dev) + * { + * ... + * } + * + * static struct pm_ops foo_ops = { + * .suspend = PTR_IF(IS_ENABLED(CONFIG_FOO_SUSPEND), foo_suspend), + * }; + * + * The foo_suspend() function will now be automatically dropped by the + * compiler, and it does not require any specific attribute. + */ +#define PTR_IF(cond, ptr) ((cond) ? (ptr) : NULL) + +/** + * to_user_ptr - cast a pointer passed as u64 from user space to void __user * + * @x: The u64 value from user space, usually via IOCTL + * + * to_user_ptr() simply casts a pointer passed as u64 from user space to void + * __user * correctly. Using this lets us get rid of all the tiresome casts. + */ +#define u64_to_user_ptr(x) \ +({ \ + typecheck(u64, (x)); \ + (void __user *)(uintptr_t)(x); \ +}) + /** * is_insidevar - check if the @ptr points inside the @var memory range. * @ptr: the pointer to a memory address. -- cgit v1.2.3-59-g8ed1b From ae5b3500856fb9a565470d83033bb91786ec16f1 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Thu, 20 Mar 2025 21:25:01 -0500 Subject: kstrtox: add support for enabled and disabled in kstrtobool() In some places in the kernel there is a design pattern for sysfs attributes to use kstrtobool() in store() and str_enabled_disabled() in show(). This is counterintuitive to interact with because kstrtobool() takes on/off but str_enabled_disabled() shows enabled/disabled. Some of those sysfs uses could switch to str_on_off() but for some attributes enabled/disabled really makes more sense. Add support for kstrtobool() to accept enabled/disabled. Link: https://lkml.kernel.org/r/20250321022538.1532445-1-superm1@kernel.org Signed-off-by: Mario Limonciello Signed-off-by: Andrew Morton --- lib/kstrtox.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lib/kstrtox.c b/lib/kstrtox.c index d586e6af5e5a..bdde40cd69d7 100644 --- a/lib/kstrtox.c +++ b/lib/kstrtox.c @@ -351,6 +351,8 @@ int kstrtobool(const char *s, bool *res) return -EINVAL; switch (s[0]) { + case 'e': + case 'E': case 'y': case 'Y': case 't': @@ -358,6 +360,8 @@ int kstrtobool(const char *s, bool *res) case '1': *res = true; return 0; + case 'd': + case 'D': case 'n': case 'N': case 'f': -- cgit v1.2.3-59-g8ed1b From 3eff6a3e574c2f704f9be1f4867c3d0b3d804435 Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Mon, 7 Apr 2025 19:44:16 +0800 Subject: errseq: eliminate special limitation for macro MAX_ERRNO Current errseq implementation depends on a very special precondition that macro MAX_ERRNO must be (2^n - 1). Eliminate the limitation by - redefining macro ERRSEQ_SHIFT - defining a new macro ERRNO_MASK instead of MAX_ERRNO for errno mask. There is no plan to change the value of MAX_ERRNO, but this makes the implementation more generic and eliminates the BUILD_BUG_ON(). Link: https://lkml.kernel.org/r/20250407-improve_errseq-v1-1-7b27cbeb8298@quicinc.com Signed-off-by: Zijun Hu Reviewed-by: Jeff Layton Signed-off-by: Andrew Morton --- lib/errseq.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/lib/errseq.c b/lib/errseq.c index 93e9b94358dc..13a2581c5a87 100644 --- a/lib/errseq.c +++ b/lib/errseq.c @@ -34,11 +34,14 @@ */ /* The low bits are designated for error code (max of MAX_ERRNO) */ -#define ERRSEQ_SHIFT ilog2(MAX_ERRNO + 1) +#define ERRSEQ_SHIFT (ilog2(MAX_ERRNO) + 1) /* This bit is used as a flag to indicate whether the value has been seen */ #define ERRSEQ_SEEN (1 << ERRSEQ_SHIFT) +/* Leverage macro ERRSEQ_SEEN to define errno mask macro here */ +#define ERRNO_MASK (ERRSEQ_SEEN - 1) + /* The lowest bit of the counter */ #define ERRSEQ_CTR_INC (1 << (ERRSEQ_SHIFT + 1)) @@ -60,8 +63,6 @@ errseq_t errseq_set(errseq_t *eseq, int err) { errseq_t cur, old; - /* MAX_ERRNO must be able to serve as a mask */ - BUILD_BUG_ON_NOT_POWER_OF_2(MAX_ERRNO + 1); /* * Ensure the error code actually fits where we want it to go. If it @@ -79,7 +80,7 @@ errseq_t errseq_set(errseq_t *eseq, int err) errseq_t new; /* Clear out error bits and set new error */ - new = (old & ~(MAX_ERRNO|ERRSEQ_SEEN)) | -err; + new = (old & ~(ERRNO_MASK | ERRSEQ_SEEN)) | -err; /* Only increment if someone has looked at it */ if (old & ERRSEQ_SEEN) @@ -148,7 +149,7 @@ int errseq_check(errseq_t *eseq, errseq_t since) if (likely(cur == since)) return 0; - return -(cur & MAX_ERRNO); + return -(cur & ERRNO_MASK); } EXPORT_SYMBOL(errseq_check); @@ -200,7 +201,7 @@ int errseq_check_and_advance(errseq_t *eseq, errseq_t *since) if (new != old) cmpxchg(eseq, old, new); *since = new; - err = -(new & MAX_ERRNO); + err = -(new & ERRNO_MASK); } return err; } -- cgit v1.2.3-59-g8ed1b From fe6f600c43e0931f4fb5e1debb2311b7a6ebabd8 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 19 Mar 2025 20:54:36 +0100 Subject: exit: combine work under lock in synchronize_group_exit() and coredump_task_exit() This reduces single-threaded overhead as it avoids one lock+irq trip on exit. It also improves scalability of spawning and killing threads within one process (just shy of 5% when doing it on 24 cores on my test jig). Both routines are moved below kcov and kmsan exit, which should be harmless. Link: https://lkml.kernel.org/r/20250319195436.1864415-1-mjguzik@gmail.com Signed-off-by: Mateusz Guzik Reviewed-by: Oleg Nesterov Signed-off-by: Andrew Morton --- kernel/exit.c | 68 ++++++++++++++++++++++++++++------------------------------- 1 file changed, 32 insertions(+), 36 deletions(-) diff --git a/kernel/exit.c b/kernel/exit.c index f1db86dcbeb1..921f28249b9b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -415,44 +415,30 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) } } -static void coredump_task_exit(struct task_struct *tsk) +static void coredump_task_exit(struct task_struct *tsk, + struct core_state *core_state) { - struct core_state *core_state; + struct core_thread self; + self.task = tsk; + if (self.task->flags & PF_SIGNALED) + self.next = xchg(&core_state->dumper.next, &self); + else + self.task = NULL; /* - * Serialize with any possible pending coredump. - * We must hold siglock around checking core_state - * and setting PF_POSTCOREDUMP. The core-inducing thread - * will increment ->nr_threads for each thread in the - * group without PF_POSTCOREDUMP set. + * Implies mb(), the result of xchg() must be visible + * to core_state->dumper. */ - spin_lock_irq(&tsk->sighand->siglock); - tsk->flags |= PF_POSTCOREDUMP; - core_state = tsk->signal->core_state; - spin_unlock_irq(&tsk->sighand->siglock); - if (core_state) { - struct core_thread self; - - self.task = current; - if (self.task->flags & PF_SIGNALED) - self.next = xchg(&core_state->dumper.next, &self); - else - self.task = NULL; - /* - * Implies mb(), the result of xchg() must be visible - * to core_state->dumper. - */ - if (atomic_dec_and_test(&core_state->nr_threads)) - complete(&core_state->startup); + if (atomic_dec_and_test(&core_state->nr_threads)) + complete(&core_state->startup); - for (;;) { - set_current_state(TASK_IDLE|TASK_FREEZABLE); - if (!self.task) /* see coredump_finish() */ - break; - schedule(); - } - __set_current_state(TASK_RUNNING); + for (;;) { + set_current_state(TASK_IDLE|TASK_FREEZABLE); + if (!self.task) /* see coredump_finish() */ + break; + schedule(); } + __set_current_state(TASK_RUNNING); } #ifdef CONFIG_MEMCG @@ -876,6 +862,7 @@ static void synchronize_group_exit(struct task_struct *tsk, long code) { struct sighand_struct *sighand = tsk->sighand; struct signal_struct *signal = tsk->signal; + struct core_state *core_state; spin_lock_irq(&sighand->siglock); signal->quick_threads--; @@ -885,7 +872,19 @@ static void synchronize_group_exit(struct task_struct *tsk, long code) signal->group_exit_code = code; signal->group_stop_count = 0; } + /* + * Serialize with any possible pending coredump. + * We must hold siglock around checking core_state + * and setting PF_POSTCOREDUMP. The core-inducing thread + * will increment ->nr_threads for each thread in the + * group without PF_POSTCOREDUMP set. + */ + tsk->flags |= PF_POSTCOREDUMP; + core_state = signal->core_state; spin_unlock_irq(&sighand->siglock); + + if (unlikely(core_state)) + coredump_task_exit(tsk, core_state); } void __noreturn do_exit(long code) @@ -894,15 +893,12 @@ void __noreturn do_exit(long code) int group_dead; WARN_ON(irqs_disabled()); - - synchronize_group_exit(tsk, code); - WARN_ON(tsk->plug); kcov_task_exit(tsk); kmsan_task_exit(tsk); - coredump_task_exit(tsk); + synchronize_group_exit(tsk, code); ptrace_event(PTRACE_EVENT_EXIT, code); user_events_exit(tsk); -- cgit v1.2.3-59-g8ed1b From 734aa85390ea693bb7eaf2240623d41b03705c84 Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Wed, 9 Apr 2025 03:47:47 +0100 Subject: Squashfs: check return result of sb_min_blocksize Syzkaller reports an "UBSAN: shift-out-of-bounds in squashfs_bio_read" bug. Syzkaller forks multiple processes which after mounting the Squashfs filesystem, issues an ioctl("/dev/loop0", LOOP_SET_BLOCK_SIZE, 0x8000). Now if this ioctl occurs at the same time another process is in the process of mounting a Squashfs filesystem on /dev/loop0, the failure occurs. When this happens the following code in squashfs_fill_super() fails. ---- msblk->devblksize = sb_min_blocksize(sb, SQUASHFS_DEVBLK_SIZE); msblk->devblksize_log2 = ffz(~msblk->devblksize); ---- sb_min_blocksize() returns 0, which means msblk->devblksize is set to 0. As a result, ffz(~msblk->devblksize) returns 64, and msblk->devblksize_log2 is set to 64. This subsequently causes the UBSAN: shift-out-of-bounds in fs/squashfs/block.c:195:36 shift exponent 64 is too large for 64-bit type 'u64' (aka 'unsigned long long') This commit adds a check for a 0 return by sb_min_blocksize(). Link: https://lkml.kernel.org/r/20250409024747.876480-1-phillip@squashfs.org.uk Fixes: 0aa666190509 ("Squashfs: super block operations") Reported-by: syzbot+65761fc25a137b9c8c6e@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/67f0dd7a.050a0220.0a13.0230.GAE@google.com/ Signed-off-by: Phillip Lougher Signed-off-by: Andrew Morton --- fs/squashfs/super.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 67c55fe32ce8..992ea0e37257 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -202,6 +202,11 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) msblk->panic_on_errors = (opts->errors == Opt_errors_panic); msblk->devblksize = sb_min_blocksize(sb, SQUASHFS_DEVBLK_SIZE); + if (!msblk->devblksize) { + errorf(fc, "squashfs: unable to set blocksize\n"); + return -EINVAL; + } + msblk->devblksize_log2 = ffz(~msblk->devblksize); mutex_init(&msblk->meta_index_mutex); -- cgit v1.2.3-59-g8ed1b From 50af973cd71ab9eea3b18429343659f4a6ebd825 Mon Sep 17 00:00:00 2001 From: WangYuli Date: Fri, 11 Apr 2025 18:26:10 +0800 Subject: ocfs2: o2net_idle_timer: Rename del_timer_sync in comment Commit 8fa7292fee5c ("treewide: Switch/rename to timer_delete[_sync]()") switched del_timer_sync to timer_delete_sync, but did not modify the comment for o2net_idle_timer(). Now fix it. Link: https://lkml.kernel.org/r/BDDB1E4E2876C36C+20250411102610.165946-1-wangyuli@uniontech.com Signed-off-by: WangYuli Acked-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/cluster/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index fce9beb214f0..43e652a2adaf 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -1483,7 +1483,7 @@ static void o2net_sc_send_keep_req(struct work_struct *work) sc_put(sc); } -/* socket shutdown does a del_timer_sync against this as it tears down. +/* socket shutdown does a timer_delete_sync against this as it tears down. * we can't start this timer until we've got to the point in sc buildup * where shutdown is going to be involved */ static void o2net_idle_timer(struct timer_list *t) -- cgit v1.2.3-59-g8ed1b From e711faaafbe54a884f33b53472434063d342f6d4 Mon Sep 17 00:00:00 2001 From: Lance Yang Date: Mon, 14 Apr 2025 22:59:43 +0800 Subject: hung_task: replace blocker_mutex with encoded blocker Patch series "hung_task: extend blocking task stacktrace dump to semaphore", v5. Inspired by mutex blocker tracking[1], this patch series extend the feature to not only dump the blocker task holding a mutex but also to support semaphores. Unlike mutexes, semaphores lack explicit ownership tracking, making it challenging to identify the root cause of hangs. To address this, we introduce a last_holder field to the semaphore structure, which is updated when a task successfully calls down() and cleared during up(). The assumption is that if a task is blocked on a semaphore, the holders must not have released it. While this does not guarantee that the last holder is one of the current blockers, it likely provides a practical hint for diagnosing semaphore-related stalls. With this change, the hung task detector can now show blocker task's info like below: [Tue Apr 8 12:19:07 2025] INFO: task cat:945 blocked for more than 120 seconds. [Tue Apr 8 12:19:07 2025] Tainted: G E 6.14.0-rc6+ #1 [Tue Apr 8 12:19:07 2025] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [Tue Apr 8 12:19:07 2025] task:cat state:D stack:0 pid:945 tgid:945 ppid:828 task_flags:0x400000 flags:0x00000000 [Tue Apr 8 12:19:07 2025] Call Trace: [Tue Apr 8 12:19:07 2025] [Tue Apr 8 12:19:07 2025] __schedule+0x491/0xbd0 [Tue Apr 8 12:19:07 2025] schedule+0x27/0xf0 [Tue Apr 8 12:19:07 2025] schedule_timeout+0xe3/0xf0 [Tue Apr 8 12:19:07 2025] ? __folio_mod_stat+0x2a/0x80 [Tue Apr 8 12:19:07 2025] ? set_ptes.constprop.0+0x27/0x90 [Tue Apr 8 12:19:07 2025] __down_common+0x155/0x280 [Tue Apr 8 12:19:07 2025] down+0x53/0x70 [Tue Apr 8 12:19:07 2025] read_dummy_semaphore+0x23/0x60 [Tue Apr 8 12:19:07 2025] full_proxy_read+0x5f/0xa0 [Tue Apr 8 12:19:07 2025] vfs_read+0xbc/0x350 [Tue Apr 8 12:19:07 2025] ? __count_memcg_events+0xa5/0x140 [Tue Apr 8 12:19:07 2025] ? count_memcg_events.constprop.0+0x1a/0x30 [Tue Apr 8 12:19:07 2025] ? handle_mm_fault+0x180/0x260 [Tue Apr 8 12:19:07 2025] ksys_read+0x66/0xe0 [Tue Apr 8 12:19:07 2025] do_syscall_64+0x51/0x120 [Tue Apr 8 12:19:07 2025] entry_SYSCALL_64_after_hwframe+0x76/0x7e [Tue Apr 8 12:19:07 2025] RIP: 0033:0x7f419478f46e [Tue Apr 8 12:19:07 2025] RSP: 002b:00007fff1c4d2668 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 [Tue Apr 8 12:19:07 2025] RAX: ffffffffffffffda RBX: 0000000000020000 RCX: 00007f419478f46e [Tue Apr 8 12:19:07 2025] RDX: 0000000000020000 RSI: 00007f4194683000 RDI: 0000000000000003 [Tue Apr 8 12:19:07 2025] RBP: 00007f4194683000 R08: 00007f4194682010 R09: 0000000000000000 [Tue Apr 8 12:19:07 2025] R10: fffffffffffffbc5 R11: 0000000000000246 R12: 0000000000000000 [Tue Apr 8 12:19:07 2025] R13: 0000000000000003 R14: 0000000000020000 R15: 0000000000020000 [Tue Apr 8 12:19:07 2025] [Tue Apr 8 12:19:07 2025] INFO: task cat:945 blocked on a semaphore likely last held by task cat:938 [Tue Apr 8 12:19:07 2025] task:cat state:S stack:0 pid:938 tgid:938 ppid:584 task_flags:0x400000 flags:0x00000000 [Tue Apr 8 12:19:07 2025] Call Trace: [Tue Apr 8 12:19:07 2025] [Tue Apr 8 12:19:07 2025] __schedule+0x491/0xbd0 [Tue Apr 8 12:19:07 2025] ? _raw_spin_unlock_irqrestore+0xe/0x40 [Tue Apr 8 12:19:07 2025] schedule+0x27/0xf0 [Tue Apr 8 12:19:07 2025] schedule_timeout+0x77/0xf0 [Tue Apr 8 12:19:07 2025] ? __pfx_process_timeout+0x10/0x10 [Tue Apr 8 12:19:07 2025] msleep_interruptible+0x49/0x60 [Tue Apr 8 12:19:07 2025] read_dummy_semaphore+0x2d/0x60 [Tue Apr 8 12:19:07 2025] full_proxy_read+0x5f/0xa0 [Tue Apr 8 12:19:07 2025] vfs_read+0xbc/0x350 [Tue Apr 8 12:19:07 2025] ? __count_memcg_events+0xa5/0x140 [Tue Apr 8 12:19:07 2025] ? count_memcg_events.constprop.0+0x1a/0x30 [Tue Apr 8 12:19:07 2025] ? handle_mm_fault+0x180/0x260 [Tue Apr 8 12:19:07 2025] ksys_read+0x66/0xe0 [Tue Apr 8 12:19:07 2025] do_syscall_64+0x51/0x120 [Tue Apr 8 12:19:07 2025] entry_SYSCALL_64_after_hwframe+0x76/0x7e [Tue Apr 8 12:19:07 2025] RIP: 0033:0x7f7c584a646e [Tue Apr 8 12:19:07 2025] RSP: 002b:00007ffdba8ce158 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 [Tue Apr 8 12:19:07 2025] RAX: ffffffffffffffda RBX: 0000000000020000 RCX: 00007f7c584a646e [Tue Apr 8 12:19:07 2025] RDX: 0000000000020000 RSI: 00007f7c5839a000 RDI: 0000000000000003 [Tue Apr 8 12:19:07 2025] RBP: 00007f7c5839a000 R08: 00007f7c58399010 R09: 0000000000000000 [Tue Apr 8 12:19:07 2025] R10: fffffffffffffbc5 R11: 0000000000000246 R12: 0000000000000000 [Tue Apr 8 12:19:07 2025] R13: 0000000000000003 R14: 0000000000020000 R15: 0000000000020000 [Tue Apr 8 12:19:07 2025] This patch (of 3): This patch replaces 'struct mutex *blocker_mutex' with 'unsigned long blocker', as only one blocker is active at a time. The blocker filed can store both the lock addrees and the lock type, with LSB used to encode the type as Masami suggested, making it easier to extend the feature to cover other types of locks. Also, once the lock type is determined, we can directly extract the address and cast it to a lock pointer ;) Link: https://lkml.kernel.org/r/20250414145945.84916-1-ioworker0@gmail.com Link: https://lore.kernel.org/all/174046694331.2194069.15472952050240807469.stgit@mhiramat.tok.corp.google.com [1] Link: https://lkml.kernel.org/r/20250414145945.84916-2-ioworker0@gmail.com Signed-off-by: Mingzhe Yang Signed-off-by: Lance Yang Reviewed-by: Masami Hiramatsu (Google) Suggested-by: Andrew Morton Suggested-by: Masami Hiramatsu (Google) Cc: Anna Schumaker Cc: Boqun Feng Cc: Ingo Molnar Cc: Joel Granados Cc: John Stultz Cc: Kent Overstreet Cc: Peter Zijlstra Cc: Sergey Senozhatsky Cc: Steven Rostedt Cc: Tomasz Figa Cc: Waiman Long Cc: Will Deacon Cc: Yongliang Gao Cc: Zi Li Signed-off-by: Andrew Morton --- include/linux/hung_task.h | 99 +++++++++++++++++++++++++++++++++++++++++++++++ include/linux/sched.h | 6 ++- kernel/hung_task.c | 13 ++++--- kernel/locking/mutex.c | 5 ++- 4 files changed, 115 insertions(+), 8 deletions(-) create mode 100644 include/linux/hung_task.h diff --git a/include/linux/hung_task.h b/include/linux/hung_task.h new file mode 100644 index 000000000000..1bc2b3244613 --- /dev/null +++ b/include/linux/hung_task.h @@ -0,0 +1,99 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Detect Hung Task: detecting tasks stuck in D state + * + * Copyright (C) 2025 Tongcheng Travel (www.ly.com) + * Author: Lance Yang + */ +#ifndef __LINUX_HUNG_TASK_H +#define __LINUX_HUNG_TASK_H + +#include +#include +#include + +/* + * @blocker: Combines lock address and blocking type. + * + * Since lock pointers are at least 4-byte aligned(32-bit) or 8-byte + * aligned(64-bit). This leaves the 2 least bits (LSBs) of the pointer + * always zero. So we can use these bits to encode the specific blocking + * type. + * + * Type encoding: + * 00 - Blocked on mutex (BLOCKER_TYPE_MUTEX) + * 01 - Blocked on semaphore (BLOCKER_TYPE_SEM) + * 10 - Blocked on rt-mutex (BLOCKER_TYPE_RTMUTEX) + * 11 - Blocked on rw-semaphore (BLOCKER_TYPE_RWSEM) + */ +#define BLOCKER_TYPE_MUTEX 0x00UL +#define BLOCKER_TYPE_SEM 0x01UL +#define BLOCKER_TYPE_RTMUTEX 0x02UL +#define BLOCKER_TYPE_RWSEM 0x03UL + +#define BLOCKER_TYPE_MASK 0x03UL + +#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER +static inline void hung_task_set_blocker(void *lock, unsigned long type) +{ + unsigned long lock_ptr = (unsigned long)lock; + + WARN_ON_ONCE(!lock_ptr); + WARN_ON_ONCE(READ_ONCE(current->blocker)); + + /* + * If the lock pointer matches the BLOCKER_TYPE_MASK, return + * without writing anything. + */ + if (WARN_ON_ONCE(lock_ptr & BLOCKER_TYPE_MASK)) + return; + + WRITE_ONCE(current->blocker, lock_ptr | type); +} + +static inline void hung_task_clear_blocker(void) +{ + WARN_ON_ONCE(!READ_ONCE(current->blocker)); + + WRITE_ONCE(current->blocker, 0UL); +} + +/* + * hung_task_get_blocker_type - Extracts blocker type from encoded blocker + * address. + * + * @blocker: Blocker pointer with encoded type (via LSB bits) + * + * Returns: BLOCKER_TYPE_MUTEX, BLOCKER_TYPE_SEM, etc. + */ +static inline unsigned long hung_task_get_blocker_type(unsigned long blocker) +{ + WARN_ON_ONCE(!blocker); + + return blocker & BLOCKER_TYPE_MASK; +} + +static inline void *hung_task_blocker_to_lock(unsigned long blocker) +{ + WARN_ON_ONCE(!blocker); + + return (void *)(blocker & ~BLOCKER_TYPE_MASK); +} +#else +static inline void hung_task_set_blocker(void *lock, unsigned long type) +{ +} +static inline void hung_task_clear_blocker(void) +{ +} +static inline unsigned long hung_task_get_blocker_type(unsigned long blocker) +{ + return 0UL; +} +static inline void *hung_task_blocker_to_lock(unsigned long blocker) +{ + return NULL; +} +#endif + +#endif /* __LINUX_HUNG_TASK_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index f96ac1982893..393d81978f40 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1240,7 +1240,11 @@ struct task_struct { #endif #ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER - struct mutex *blocker_mutex; + /* + * Encoded lock address causing task block (lower 2 bits = type from + * ). Accessed via hung_task_*() helpers. + */ + unsigned long blocker; #endif #ifdef CONFIG_DEBUG_ATOMIC_SLEEP diff --git a/kernel/hung_task.c b/kernel/hung_task.c index dc898ec93463..79558d76ef06 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -22,6 +22,7 @@ #include #include #include +#include #include @@ -98,16 +99,18 @@ static struct notifier_block panic_block = { static void debug_show_blocker(struct task_struct *task) { struct task_struct *g, *t; - unsigned long owner; - struct mutex *lock; + unsigned long owner, blocker; RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "No rcu lock held"); - lock = READ_ONCE(task->blocker_mutex); - if (!lock) + blocker = READ_ONCE(task->blocker); + if (!blocker || + hung_task_get_blocker_type(blocker) != BLOCKER_TYPE_MUTEX) return; - owner = mutex_get_owner(lock); + owner = mutex_get_owner( + (struct mutex *)hung_task_blocker_to_lock(blocker)); + if (unlikely(!owner)) { pr_err("INFO: task %s:%d is blocked on a mutex, but the owner is not found.\n", task->comm, task->pid); diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 555e2b3a665a..61fa97da7989 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -29,6 +29,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -191,7 +192,7 @@ __mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, struct list_head *list) { #ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER - WRITE_ONCE(current->blocker_mutex, lock); + hung_task_set_blocker(lock, BLOCKER_TYPE_MUTEX); #endif debug_mutex_add_waiter(lock, waiter, current); @@ -209,7 +210,7 @@ __mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter) debug_mutex_remove_waiter(lock, waiter, current); #ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER - WRITE_ONCE(current->blocker_mutex, NULL); + hung_task_clear_blocker(); #endif } -- cgit v1.2.3-59-g8ed1b From 194a9b9e843b4077033be70a07d191107972aa53 Mon Sep 17 00:00:00 2001 From: Lance Yang Date: Mon, 14 Apr 2025 22:59:44 +0800 Subject: hung_task: show the blocker task if the task is hung on semaphore Inspired by mutex blocker tracking[1], this patch makes a trade-off to balance the overhead and utility of the hung task detector. Unlike mutexes, semaphores lack explicit ownership tracking, making it challenging to identify the root cause of hangs. To address this, we introduce a last_holder field to the semaphore structure, which is updated when a task successfully calls down() and cleared during up(). The assumption is that if a task is blocked on a semaphore, the holders must not have released it. While this does not guarantee that the last holder is one of the current blockers, it likely provides a practical hint for diagnosing semaphore-related stalls. With this change, the hung task detector can now show blocker task's info like below: [Tue Apr 8 12:19:07 2025] INFO: task cat:945 blocked for more than 120 seconds. [Tue Apr 8 12:19:07 2025] Tainted: G E 6.14.0-rc6+ #1 [Tue Apr 8 12:19:07 2025] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [Tue Apr 8 12:19:07 2025] task:cat state:D stack:0 pid:945 tgid:945 ppid:828 task_flags:0x400000 flags:0x00000000 [Tue Apr 8 12:19:07 2025] Call Trace: [Tue Apr 8 12:19:07 2025] [Tue Apr 8 12:19:07 2025] __schedule+0x491/0xbd0 [Tue Apr 8 12:19:07 2025] schedule+0x27/0xf0 [Tue Apr 8 12:19:07 2025] schedule_timeout+0xe3/0xf0 [Tue Apr 8 12:19:07 2025] ? __folio_mod_stat+0x2a/0x80 [Tue Apr 8 12:19:07 2025] ? set_ptes.constprop.0+0x27/0x90 [Tue Apr 8 12:19:07 2025] __down_common+0x155/0x280 [Tue Apr 8 12:19:07 2025] down+0x53/0x70 [Tue Apr 8 12:19:07 2025] read_dummy_semaphore+0x23/0x60 [Tue Apr 8 12:19:07 2025] full_proxy_read+0x5f/0xa0 [Tue Apr 8 12:19:07 2025] vfs_read+0xbc/0x350 [Tue Apr 8 12:19:07 2025] ? __count_memcg_events+0xa5/0x140 [Tue Apr 8 12:19:07 2025] ? count_memcg_events.constprop.0+0x1a/0x30 [Tue Apr 8 12:19:07 2025] ? handle_mm_fault+0x180/0x260 [Tue Apr 8 12:19:07 2025] ksys_read+0x66/0xe0 [Tue Apr 8 12:19:07 2025] do_syscall_64+0x51/0x120 [Tue Apr 8 12:19:07 2025] entry_SYSCALL_64_after_hwframe+0x76/0x7e [Tue Apr 8 12:19:07 2025] RIP: 0033:0x7f419478f46e [Tue Apr 8 12:19:07 2025] RSP: 002b:00007fff1c4d2668 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 [Tue Apr 8 12:19:07 2025] RAX: ffffffffffffffda RBX: 0000000000020000 RCX: 00007f419478f46e [Tue Apr 8 12:19:07 2025] RDX: 0000000000020000 RSI: 00007f4194683000 RDI: 0000000000000003 [Tue Apr 8 12:19:07 2025] RBP: 00007f4194683000 R08: 00007f4194682010 R09: 0000000000000000 [Tue Apr 8 12:19:07 2025] R10: fffffffffffffbc5 R11: 0000000000000246 R12: 0000000000000000 [Tue Apr 8 12:19:07 2025] R13: 0000000000000003 R14: 0000000000020000 R15: 0000000000020000 [Tue Apr 8 12:19:07 2025] [Tue Apr 8 12:19:07 2025] INFO: task cat:945 blocked on a semaphore likely last held by task cat:938 [Tue Apr 8 12:19:07 2025] task:cat state:S stack:0 pid:938 tgid:938 ppid:584 task_flags:0x400000 flags:0x00000000 [Tue Apr 8 12:19:07 2025] Call Trace: [Tue Apr 8 12:19:07 2025] [Tue Apr 8 12:19:07 2025] __schedule+0x491/0xbd0 [Tue Apr 8 12:19:07 2025] ? _raw_spin_unlock_irqrestore+0xe/0x40 [Tue Apr 8 12:19:07 2025] schedule+0x27/0xf0 [Tue Apr 8 12:19:07 2025] schedule_timeout+0x77/0xf0 [Tue Apr 8 12:19:07 2025] ? __pfx_process_timeout+0x10/0x10 [Tue Apr 8 12:19:07 2025] msleep_interruptible+0x49/0x60 [Tue Apr 8 12:19:07 2025] read_dummy_semaphore+0x2d/0x60 [Tue Apr 8 12:19:07 2025] full_proxy_read+0x5f/0xa0 [Tue Apr 8 12:19:07 2025] vfs_read+0xbc/0x350 [Tue Apr 8 12:19:07 2025] ? __count_memcg_events+0xa5/0x140 [Tue Apr 8 12:19:07 2025] ? count_memcg_events.constprop.0+0x1a/0x30 [Tue Apr 8 12:19:07 2025] ? handle_mm_fault+0x180/0x260 [Tue Apr 8 12:19:07 2025] ksys_read+0x66/0xe0 [Tue Apr 8 12:19:07 2025] do_syscall_64+0x51/0x120 [Tue Apr 8 12:19:07 2025] entry_SYSCALL_64_after_hwframe+0x76/0x7e [Tue Apr 8 12:19:07 2025] RIP: 0033:0x7f7c584a646e [Tue Apr 8 12:19:07 2025] RSP: 002b:00007ffdba8ce158 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 [Tue Apr 8 12:19:07 2025] RAX: ffffffffffffffda RBX: 0000000000020000 RCX: 00007f7c584a646e [Tue Apr 8 12:19:07 2025] RDX: 0000000000020000 RSI: 00007f7c5839a000 RDI: 0000000000000003 [Tue Apr 8 12:19:07 2025] RBP: 00007f7c5839a000 R08: 00007f7c58399010 R09: 0000000000000000 [Tue Apr 8 12:19:07 2025] R10: fffffffffffffbc5 R11: 0000000000000246 R12: 0000000000000000 [Tue Apr 8 12:19:07 2025] R13: 0000000000000003 R14: 0000000000020000 R15: 0000000000020000 [Tue Apr 8 12:19:07 2025] [1] https://lore.kernel.org/all/174046694331.2194069.15472952050240807469.stgit@mhiramat.tok.corp.google.com Link: https://lkml.kernel.org/r/20250414145945.84916-3-ioworker0@gmail.com Signed-off-by: Mingzhe Yang Signed-off-by: Lance Yang Suggested-by: Andrew Morton Suggested-by: Masami Hiramatsu (Google) Reviewed-by: Masami Hiramatsu (Google) Cc: Anna Schumaker Cc: Boqun Feng Cc: Ingo Molnar Cc: Joel Granados Cc: John Stultz Cc: Kent Overstreet Cc: Peter Zijlstra Cc: Sergey Senozhatsky Cc: Steven Rostedt Cc: Tomasz Figa Cc: Waiman Long Cc: Will Deacon Cc: Yongliang Gao Cc: Zi Li Signed-off-by: Andrew Morton --- include/linux/semaphore.h | 15 +++++++++++- kernel/hung_task.c | 52 +++++++++++++++++++++++++++++++++--------- kernel/locking/semaphore.c | 57 +++++++++++++++++++++++++++++++++++++++++----- 3 files changed, 106 insertions(+), 18 deletions(-) diff --git a/include/linux/semaphore.h b/include/linux/semaphore.h index 04655faadc2d..89706157e622 100644 --- a/include/linux/semaphore.h +++ b/include/linux/semaphore.h @@ -16,13 +16,25 @@ struct semaphore { raw_spinlock_t lock; unsigned int count; struct list_head wait_list; + +#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER + unsigned long last_holder; +#endif }; +#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER +#define __LAST_HOLDER_SEMAPHORE_INITIALIZER \ + , .last_holder = 0UL +#else +#define __LAST_HOLDER_SEMAPHORE_INITIALIZER +#endif + #define __SEMAPHORE_INITIALIZER(name, n) \ { \ .lock = __RAW_SPIN_LOCK_UNLOCKED((name).lock), \ .count = n, \ - .wait_list = LIST_HEAD_INIT((name).wait_list), \ + .wait_list = LIST_HEAD_INIT((name).wait_list) \ + __LAST_HOLDER_SEMAPHORE_INITIALIZER \ } /* @@ -47,5 +59,6 @@ extern int __must_check down_killable(struct semaphore *sem); extern int __must_check down_trylock(struct semaphore *sem); extern int __must_check down_timeout(struct semaphore *sem, long jiffies); extern void up(struct semaphore *sem); +extern unsigned long sem_last_holder(struct semaphore *sem); #endif /* __LINUX_SEMAPHORE_H */ diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 79558d76ef06..d2432df2b905 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -99,32 +99,62 @@ static struct notifier_block panic_block = { static void debug_show_blocker(struct task_struct *task) { struct task_struct *g, *t; - unsigned long owner, blocker; + unsigned long owner, blocker, blocker_type; RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "No rcu lock held"); blocker = READ_ONCE(task->blocker); - if (!blocker || - hung_task_get_blocker_type(blocker) != BLOCKER_TYPE_MUTEX) + if (!blocker) return; - owner = mutex_get_owner( - (struct mutex *)hung_task_blocker_to_lock(blocker)); + blocker_type = hung_task_get_blocker_type(blocker); + + switch (blocker_type) { + case BLOCKER_TYPE_MUTEX: + owner = mutex_get_owner( + (struct mutex *)hung_task_blocker_to_lock(blocker)); + break; + case BLOCKER_TYPE_SEM: + owner = sem_last_holder( + (struct semaphore *)hung_task_blocker_to_lock(blocker)); + break; + default: + WARN_ON_ONCE(1); + return; + } + if (unlikely(!owner)) { - pr_err("INFO: task %s:%d is blocked on a mutex, but the owner is not found.\n", - task->comm, task->pid); + switch (blocker_type) { + case BLOCKER_TYPE_MUTEX: + pr_err("INFO: task %s:%d is blocked on a mutex, but the owner is not found.\n", + task->comm, task->pid); + break; + case BLOCKER_TYPE_SEM: + pr_err("INFO: task %s:%d is blocked on a semaphore, but the last holder is not found.\n", + task->comm, task->pid); + break; + } return; } /* Ensure the owner information is correct. */ for_each_process_thread(g, t) { - if ((unsigned long)t == owner) { + if ((unsigned long)t != owner) + continue; + + switch (blocker_type) { + case BLOCKER_TYPE_MUTEX: pr_err("INFO: task %s:%d is blocked on a mutex likely owned by task %s:%d.\n", - task->comm, task->pid, t->comm, t->pid); - sched_show_task(t); - return; + task->comm, task->pid, t->comm, t->pid); + break; + case BLOCKER_TYPE_SEM: + pr_err("INFO: task %s:%d blocked on a semaphore likely last held by task %s:%d\n", + task->comm, task->pid, t->comm, t->pid); + break; } + sched_show_task(t); + return; } } #else diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c index de9117c0e671..3ef032e22f7e 100644 --- a/kernel/locking/semaphore.c +++ b/kernel/locking/semaphore.c @@ -34,6 +34,7 @@ #include #include #include +#include static noinline void __down(struct semaphore *sem); static noinline int __down_interruptible(struct semaphore *sem); @@ -41,6 +42,41 @@ static noinline int __down_killable(struct semaphore *sem); static noinline int __down_timeout(struct semaphore *sem, long timeout); static noinline void __up(struct semaphore *sem, struct wake_q_head *wake_q); +#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER +static inline void hung_task_sem_set_holder(struct semaphore *sem) +{ + WRITE_ONCE((sem)->last_holder, (unsigned long)current); +} + +static inline void hung_task_sem_clear_if_holder(struct semaphore *sem) +{ + if (READ_ONCE((sem)->last_holder) == (unsigned long)current) + WRITE_ONCE((sem)->last_holder, 0UL); +} + +unsigned long sem_last_holder(struct semaphore *sem) +{ + return READ_ONCE(sem->last_holder); +} +#else +static inline void hung_task_sem_set_holder(struct semaphore *sem) +{ +} +static inline void hung_task_sem_clear_if_holder(struct semaphore *sem) +{ +} +unsigned long sem_last_holder(struct semaphore *sem) +{ + return 0UL; +} +#endif + +static inline void __sem_acquire(struct semaphore *sem) +{ + sem->count--; + hung_task_sem_set_holder(sem); +} + /** * down - acquire the semaphore * @sem: the semaphore to be acquired @@ -59,7 +95,7 @@ void __sched down(struct semaphore *sem) might_sleep(); raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) - sem->count--; + __sem_acquire(sem); else __down(sem); raw_spin_unlock_irqrestore(&sem->lock, flags); @@ -83,7 +119,7 @@ int __sched down_interruptible(struct semaphore *sem) might_sleep(); raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) - sem->count--; + __sem_acquire(sem); else result = __down_interruptible(sem); raw_spin_unlock_irqrestore(&sem->lock, flags); @@ -110,7 +146,7 @@ int __sched down_killable(struct semaphore *sem) might_sleep(); raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) - sem->count--; + __sem_acquire(sem); else result = __down_killable(sem); raw_spin_unlock_irqrestore(&sem->lock, flags); @@ -140,7 +176,7 @@ int __sched down_trylock(struct semaphore *sem) raw_spin_lock_irqsave(&sem->lock, flags); count = sem->count - 1; if (likely(count >= 0)) - sem->count = count; + __sem_acquire(sem); raw_spin_unlock_irqrestore(&sem->lock, flags); return (count < 0); @@ -165,7 +201,7 @@ int __sched down_timeout(struct semaphore *sem, long timeout) might_sleep(); raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) - sem->count--; + __sem_acquire(sem); else result = __down_timeout(sem, timeout); raw_spin_unlock_irqrestore(&sem->lock, flags); @@ -187,6 +223,9 @@ void __sched up(struct semaphore *sem) DEFINE_WAKE_Q(wake_q); raw_spin_lock_irqsave(&sem->lock, flags); + + hung_task_sem_clear_if_holder(sem); + if (likely(list_empty(&sem->wait_list))) sem->count++; else @@ -228,8 +267,10 @@ static inline int __sched ___down_common(struct semaphore *sem, long state, raw_spin_unlock_irq(&sem->lock); timeout = schedule_timeout(timeout); raw_spin_lock_irq(&sem->lock); - if (waiter.up) + if (waiter.up) { + hung_task_sem_set_holder(sem); return 0; + } } timed_out: @@ -246,10 +287,14 @@ static inline int __sched __down_common(struct semaphore *sem, long state, { int ret; + hung_task_set_blocker(sem, BLOCKER_TYPE_SEM); + trace_contention_begin(sem, 0); ret = ___down_common(sem, state, timeout); trace_contention_end(sem, ret); + hung_task_clear_blocker(); + return ret; } -- cgit v1.2.3-59-g8ed1b From 1abf729e9d9f4eb42c01fa9cb86eeb1766ecd49f Mon Sep 17 00:00:00 2001 From: Zi Li Date: Mon, 14 Apr 2025 22:59:45 +0800 Subject: samples: extend hung_task detector test with semaphore support Extend the existing hung_task detector test module to support multiple lock types, including mutex and semaphore, with room for future additions (e.g., spinlock, etc.). This module creates dummy files under /hung_task, such as 'mutex' and 'semaphore'. The read process on any of these files will sleep for enough long time (256 seconds) while holding the respective lock. As a result, the second process will wait on the lock for a prolonged duration and be detected by the hung_task detector. This change unifies the previous mutex-only sample into a single, extensible hung_task_tests module, reducing code duplication and improving maintainability. Usage is: > cd /sys/kernel/debug/hung_task > cat mutex & cat mutex # Test mutex blocking > cat semaphore & cat semaphore # Test semaphore blocking Update the Kconfig description to reflect multiple debugfs files support. Link: https://lkml.kernel.org/r/20250414145945.84916-4-ioworker0@gmail.com Signed-off-by: Lance Yang Signed-off-by: Zi Li Suggested-by: Masami Hiramatsu (Google) Acked-by: Masami Hiramatsu (Google) Cc: Anna Schumaker Cc: Boqun Feng Cc: Ingo Molnar Cc: Joel Granados Cc: John Stultz Cc: Kent Overstreet Cc: Mingzhe Yang Cc: Peter Zijlstra Cc: Sergey Senozhatsky Cc: Steven Rostedt Cc: Tomasz Figa Cc: Waiman Long Cc: Will Deacon Cc: Yongliang Gao Signed-off-by: Andrew Morton --- samples/Kconfig | 9 ++-- samples/hung_task/Makefile | 2 +- samples/hung_task/hung_task_mutex.c | 66 ------------------------- samples/hung_task/hung_task_tests.c | 97 +++++++++++++++++++++++++++++++++++++ 4 files changed, 103 insertions(+), 71 deletions(-) delete mode 100644 samples/hung_task/hung_task_mutex.c create mode 100644 samples/hung_task/hung_task_tests.c diff --git a/samples/Kconfig b/samples/Kconfig index 09011be2391a..753ed1f170b5 100644 --- a/samples/Kconfig +++ b/samples/Kconfig @@ -304,10 +304,11 @@ config SAMPLE_HUNG_TASK tristate "Hung task detector test code" depends on DETECT_HUNG_TASK && DEBUG_FS help - Build a module which provide a simple debugfs file. If user reads - the file, it will sleep long time (256 seconds) with holding a - mutex. Thus if there are 2 or more processes read this file, it - will be detected by the hung_task watchdog. + Build a module that provides debugfs files (e.g., mutex, semaphore, + etc.) under /hung_task. If user reads one of these files, + it will sleep long time (256 seconds) with holding a lock. Thus, + if 2 or more processes read the same file concurrently, it will + be detected by the hung_task watchdog. source "samples/rust/Kconfig" diff --git a/samples/hung_task/Makefile b/samples/hung_task/Makefile index f4d6ab563488..86036f1a204d 100644 --- a/samples/hung_task/Makefile +++ b/samples/hung_task/Makefile @@ -1,2 +1,2 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-$(CONFIG_SAMPLE_HUNG_TASK) += hung_task_mutex.o +obj-$(CONFIG_SAMPLE_HUNG_TASK) += hung_task_tests.o diff --git a/samples/hung_task/hung_task_mutex.c b/samples/hung_task/hung_task_mutex.c deleted file mode 100644 index 47ed38239ea3..000000000000 --- a/samples/hung_task/hung_task_mutex.c +++ /dev/null @@ -1,66 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * hung_task_mutex.c - Sample code which causes hung task by mutex - * - * Usage: load this module and read `/hung_task/mutex` - * by 2 or more processes. - * - * This is for testing kernel hung_task error message. - * Note that this will make your system freeze and maybe - * cause panic. So do not use this except for the test. - */ - -#include -#include -#include -#include -#include - -#define HUNG_TASK_DIR "hung_task" -#define HUNG_TASK_FILE "mutex" -#define SLEEP_SECOND 256 - -static const char dummy_string[] = "This is a dummy string."; -static DEFINE_MUTEX(dummy_mutex); -static struct dentry *hung_task_dir; - -static ssize_t read_dummy(struct file *file, char __user *user_buf, - size_t count, loff_t *ppos) -{ - /* If the second task waits on the lock, it is uninterruptible sleep. */ - guard(mutex)(&dummy_mutex); - - /* When the first task sleep here, it is interruptible. */ - msleep_interruptible(SLEEP_SECOND * 1000); - - return simple_read_from_buffer(user_buf, count, ppos, - dummy_string, sizeof(dummy_string)); -} - -static const struct file_operations hung_task_fops = { - .read = read_dummy, -}; - -static int __init hung_task_sample_init(void) -{ - hung_task_dir = debugfs_create_dir(HUNG_TASK_DIR, NULL); - if (IS_ERR(hung_task_dir)) - return PTR_ERR(hung_task_dir); - - debugfs_create_file(HUNG_TASK_FILE, 0400, hung_task_dir, - NULL, &hung_task_fops); - - return 0; -} - -static void __exit hung_task_sample_exit(void) -{ - debugfs_remove_recursive(hung_task_dir); -} - -module_init(hung_task_sample_init); -module_exit(hung_task_sample_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Masami Hiramatsu"); -MODULE_DESCRIPTION("Simple sleep under mutex file for testing hung task"); diff --git a/samples/hung_task/hung_task_tests.c b/samples/hung_task/hung_task_tests.c new file mode 100644 index 000000000000..a5c09bd3a47d --- /dev/null +++ b/samples/hung_task/hung_task_tests.c @@ -0,0 +1,97 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * hung_task_tests.c - Sample code for testing hung tasks with mutex, + * semaphore, etc. + * + * Usage: Load this module and read `/hung_task/mutex`, + * `/hung_task/semaphore`, etc., with 2 or more processes. + * + * This is for testing kernel hung_task error messages with various locking + * mechanisms (e.g., mutex, semaphore, etc.). Note that this may freeze + * your system or cause a panic. Use only for testing purposes. + */ + +#include +#include +#include +#include +#include +#include + +#define HUNG_TASK_DIR "hung_task" +#define HUNG_TASK_MUTEX_FILE "mutex" +#define HUNG_TASK_SEM_FILE "semaphore" +#define SLEEP_SECOND 256 + +static const char dummy_string[] = "This is a dummy string."; +static DEFINE_MUTEX(dummy_mutex); +static DEFINE_SEMAPHORE(dummy_sem, 1); +static struct dentry *hung_task_dir; + +/* Mutex-based read function */ +static ssize_t read_dummy_mutex(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + /* Second task waits on mutex, entering uninterruptible sleep */ + guard(mutex)(&dummy_mutex); + + /* First task sleeps here, interruptible */ + msleep_interruptible(SLEEP_SECOND * 1000); + + return simple_read_from_buffer(user_buf, count, ppos, dummy_string, + sizeof(dummy_string)); +} + +/* Semaphore-based read function */ +static ssize_t read_dummy_semaphore(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + /* Second task waits on semaphore, entering uninterruptible sleep */ + down(&dummy_sem); + + /* First task sleeps here, interruptible */ + msleep_interruptible(SLEEP_SECOND * 1000); + + up(&dummy_sem); + + return simple_read_from_buffer(user_buf, count, ppos, dummy_string, + sizeof(dummy_string)); +} + +/* File operations for mutex */ +static const struct file_operations hung_task_mutex_fops = { + .read = read_dummy_mutex, +}; + +/* File operations for semaphore */ +static const struct file_operations hung_task_sem_fops = { + .read = read_dummy_semaphore, +}; + +static int __init hung_task_tests_init(void) +{ + hung_task_dir = debugfs_create_dir(HUNG_TASK_DIR, NULL); + if (IS_ERR(hung_task_dir)) + return PTR_ERR(hung_task_dir); + + /* Create debugfs files for mutex and semaphore tests */ + debugfs_create_file(HUNG_TASK_MUTEX_FILE, 0400, hung_task_dir, NULL, + &hung_task_mutex_fops); + debugfs_create_file(HUNG_TASK_SEM_FILE, 0400, hung_task_dir, NULL, + &hung_task_sem_fops); + + return 0; +} + +static void __exit hung_task_tests_exit(void) +{ + debugfs_remove_recursive(hung_task_dir); +} + +module_init(hung_task_tests_init); +module_exit(hung_task_tests_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Masami Hiramatsu "); +MODULE_AUTHOR("Zi Li "); +MODULE_DESCRIPTION("Simple sleep under lock files for testing hung task"); -- cgit v1.2.3-59-g8ed1b From 7d9b05277ae89b9306f9d71ec35a03ee59508334 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Tue, 8 Apr 2025 12:34:07 +0200 Subject: ocfs2: simplify return statement in ocfs2_filecheck_attr_store() Don't negate 'ret' and simplify the return statement. No functional changes intended. Signed-off-by: Thorsten Blum Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/filecheck.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c index 1ad7106741f8..3ad7baf67658 100644 --- a/fs/ocfs2/filecheck.c +++ b/fs/ocfs2/filecheck.c @@ -505,5 +505,5 @@ static ssize_t ocfs2_filecheck_attr_store(struct kobject *kobj, ocfs2_filecheck_handle_entry(ent, entry); exit: - return (!ret ? count : ret); + return ret ?: count; } -- cgit v1.2.3-59-g8ed1b From 8d1d4b538bb12a858fa95a073c4aff31e79088ee Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Wed, 16 Apr 2025 10:06:13 -0600 Subject: scatterlist: inline sg_next() sg_next() is a short function called frequently in I/O paths. Define it in the header file so it can be inlined into its callers. Link: https://lkml.kernel.org/r/20250416160615.3571958-1-csander@purestorage.com Signed-off-by: Caleb Sander Mateos Cc: Eric Biggers Signed-off-by: Andrew Morton --- include/linux/scatterlist.h | 23 ++++++++++++++++++++++- lib/scatterlist.c | 23 ----------------------- 2 files changed, 22 insertions(+), 24 deletions(-) diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index 138e2f1bd08f..0cdbfc42f153 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -94,6 +94,28 @@ static inline bool sg_is_last(struct scatterlist *sg) return __sg_flags(sg) & SG_END; } +/** + * sg_next - return the next scatterlist entry in a list + * @sg: The current sg entry + * + * Description: + * Usually the next entry will be @sg@ + 1, but if this sg element is part + * of a chained scatterlist, it could jump to the start of a new + * scatterlist array. + * + **/ +static inline struct scatterlist *sg_next(struct scatterlist *sg) +{ + if (sg_is_last(sg)) + return NULL; + + sg++; + if (unlikely(sg_is_chain(sg))) + sg = sg_chain_ptr(sg); + + return sg; +} + /** * sg_assign_page - Assign a given page to an SG entry * @sg: SG entry @@ -418,7 +440,6 @@ static inline void sg_init_marker(struct scatterlist *sgl, int sg_nents(struct scatterlist *sg); int sg_nents_for_len(struct scatterlist *sg, u64 len); -struct scatterlist *sg_next(struct scatterlist *); struct scatterlist *sg_last(struct scatterlist *s, unsigned int); void sg_init_table(struct scatterlist *, unsigned int); void sg_init_one(struct scatterlist *, const void *, unsigned int); diff --git a/lib/scatterlist.c b/lib/scatterlist.c index b58d5ef1a34b..7582dfab7fe3 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -13,29 +13,6 @@ #include #include -/** - * sg_next - return the next scatterlist entry in a list - * @sg: The current sg entry - * - * Description: - * Usually the next entry will be @sg@ + 1, but if this sg element is part - * of a chained scatterlist, it could jump to the start of a new - * scatterlist array. - * - **/ -struct scatterlist *sg_next(struct scatterlist *sg) -{ - if (sg_is_last(sg)) - return NULL; - - sg++; - if (unlikely(sg_is_chain(sg))) - sg = sg_chain_ptr(sg); - - return sg; -} -EXPORT_SYMBOL(sg_next); - /** * sg_nents - return total count of entries in scatterlist * @sg: The scatterlist -- cgit v1.2.3-59-g8ed1b From b7df1f254e1a326e6c55698b76bf407df4ec6b6c Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sat, 19 Apr 2025 21:30:11 +0100 Subject: rapidio: remove some dead defines Patch series "rapidio deadcoding". A couple of rapidio deadcoding patches. The first of these is a repost and was originally posted almost a year ago https://lore.kernel.org/all/20240528002515.211366-1-linux@treblig.org/ but got no answer. Other than being rebased and a typo fixed, it's not changed. This patch (of 2): 'mport_dma_buf', 'rio_mport_dma_map' and 'MPORT_MAX_DMA_BUFS' were added in the original commit e8de370188d0 ("rapidio: add mport char device driver") but never used. 'rio_cm_work' was unused since the original commit b6e8d4aa1110 ("rapidio: add RapidIO channelized messaging driver") but never used. Remove them. Link: https://lkml.kernel.org/r/20250419203012.429787-1-linux@treblig.org Link: https://lkml.kernel.org/r/20250419203012.429787-2-linux@treblig.org Signed-off-by: Dr. David Alan Gilbert Cc: Alexandre Bounine Cc: Matt Porter Signed-off-by: Andrew Morton --- drivers/rapidio/devices/rio_mport_cdev.c | 20 -------------------- drivers/rapidio/rio_cm.c | 6 ------ 2 files changed, 26 deletions(-) diff --git a/drivers/rapidio/devices/rio_mport_cdev.c b/drivers/rapidio/devices/rio_mport_cdev.c index cbf531d0ba68..995cfeca972b 100644 --- a/drivers/rapidio/devices/rio_mport_cdev.c +++ b/drivers/rapidio/devices/rio_mport_cdev.c @@ -97,18 +97,6 @@ module_param(dbg_level, uint, S_IWUSR | S_IWGRP | S_IRUGO); MODULE_PARM_DESC(dbg_level, "Debugging output level (default 0 = none)"); #endif -/* - * An internal DMA coherent buffer - */ -struct mport_dma_buf { - void *ib_base; - dma_addr_t ib_phys; - u32 ib_size; - u64 ib_rio_base; - bool ib_map; - struct file *filp; -}; - /* * Internal memory mapping structure */ @@ -131,14 +119,6 @@ struct rio_mport_mapping { struct file *filp; }; -struct rio_mport_dma_map { - int valid; - u64 length; - void *vaddr; - dma_addr_t paddr; -}; - -#define MPORT_MAX_DMA_BUFS 16 #define MPORT_EVENT_DEPTH 10 /* diff --git a/drivers/rapidio/rio_cm.c b/drivers/rapidio/rio_cm.c index 9135227301c8..97287e838ce1 100644 --- a/drivers/rapidio/rio_cm.c +++ b/drivers/rapidio/rio_cm.c @@ -198,12 +198,6 @@ struct cm_peer { struct rio_dev *rdev; }; -struct rio_cm_work { - struct work_struct work; - struct cm_dev *cm; - void *data; -}; - struct conn_req { struct list_head node; u32 destid; /* requester destID */ -- cgit v1.2.3-59-g8ed1b From ba8182d44b4e557e2dc34e51a88105ce5b083372 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sat, 19 Apr 2025 21:30:12 +0100 Subject: rapidio: remove unused functions rio_request_dma() and rio_dma_prep_slave_sg() were added in 2012 by commit e42d98ebe7d7 ("rapidio: add DMA engine support for RIO data transfers") but never used. rio_find_mport() last use was removed in 2013 by commit 9edbc30b434f ("rapidio: update enumerator registration mechanism") rio_unregister_scan() was added in 2013 by commit a11650e11093 ("rapidio: make enumeration/discovery configurable") but never used. Remove them. Link: https://lkml.kernel.org/r/20250419203012.429787-3-linux@treblig.org Signed-off-by: Dr. David Alan Gilbert Cc: Alexandre Bounine Cc: Matt Porter Signed-off-by: Andrew Morton --- drivers/rapidio/rio.c | 103 ------------------------------------------------ drivers/rapidio/rio.h | 2 - include/linux/rio_drv.h | 5 --- 3 files changed, 110 deletions(-) diff --git a/drivers/rapidio/rio.c b/drivers/rapidio/rio.c index 9544b8ee0c96..46daf32ea13b 100644 --- a/drivers/rapidio/rio.c +++ b/drivers/rapidio/rio.c @@ -1774,19 +1774,6 @@ struct dma_chan *rio_request_mport_dma(struct rio_mport *mport) } EXPORT_SYMBOL_GPL(rio_request_mport_dma); -/** - * rio_request_dma - request RapidIO capable DMA channel that supports - * specified target RapidIO device. - * @rdev: RIO device associated with DMA transfer - * - * Returns pointer to allocated DMA channel or NULL if failed. - */ -struct dma_chan *rio_request_dma(struct rio_dev *rdev) -{ - return rio_request_mport_dma(rdev->net->hport); -} -EXPORT_SYMBOL_GPL(rio_request_dma); - /** * rio_release_dma - release specified DMA channel * @dchan: DMA channel to release @@ -1834,56 +1821,8 @@ struct dma_async_tx_descriptor *rio_dma_prep_xfer(struct dma_chan *dchan, } EXPORT_SYMBOL_GPL(rio_dma_prep_xfer); -/** - * rio_dma_prep_slave_sg - RapidIO specific wrapper - * for device_prep_slave_sg callback defined by DMAENGINE. - * @rdev: RIO device control structure - * @dchan: DMA channel to configure - * @data: RIO specific data descriptor - * @direction: DMA data transfer direction (TO or FROM the device) - * @flags: dmaengine defined flags - * - * Initializes RapidIO capable DMA channel for the specified data transfer. - * Uses DMA channel private extension to pass information related to remote - * target RIO device. - * - * Returns: pointer to DMA transaction descriptor if successful, - * error-valued pointer or NULL if failed. - */ -struct dma_async_tx_descriptor *rio_dma_prep_slave_sg(struct rio_dev *rdev, - struct dma_chan *dchan, struct rio_dma_data *data, - enum dma_transfer_direction direction, unsigned long flags) -{ - return rio_dma_prep_xfer(dchan, rdev->destid, data, direction, flags); -} -EXPORT_SYMBOL_GPL(rio_dma_prep_slave_sg); - #endif /* CONFIG_RAPIDIO_DMA_ENGINE */ -/** - * rio_find_mport - find RIO mport by its ID - * @mport_id: number (ID) of mport device - * - * Given a RIO mport number, the desired mport is located - * in the global list of mports. If the mport is found, a pointer to its - * data structure is returned. If no mport is found, %NULL is returned. - */ -struct rio_mport *rio_find_mport(int mport_id) -{ - struct rio_mport *port; - - mutex_lock(&rio_mport_list_lock); - list_for_each_entry(port, &rio_mports, node) { - if (port->id == mport_id) - goto found; - } - port = NULL; -found: - mutex_unlock(&rio_mport_list_lock); - - return port; -} - /** * rio_register_scan - enumeration/discovery method registration interface * @mport_id: mport device ID for which fabric scan routine has to be set @@ -1961,48 +1900,6 @@ err_out: } EXPORT_SYMBOL_GPL(rio_register_scan); -/** - * rio_unregister_scan - removes enumeration/discovery method from mport - * @mport_id: mport device ID for which fabric scan routine has to be - * unregistered (RIO_MPORT_ANY = apply to all mports that use - * the specified scan_ops) - * @scan_ops: enumeration/discovery operations structure - * - * Removes enumeration or discovery method assigned to the specified mport - * device. If RIO_MPORT_ANY is specified, removes the specified operations from - * all mports that have them attached. - */ -int rio_unregister_scan(int mport_id, struct rio_scan *scan_ops) -{ - struct rio_mport *port; - struct rio_scan_node *scan; - - pr_debug("RIO: %s for mport_id=%d\n", __func__, mport_id); - - if (mport_id != RIO_MPORT_ANY && mport_id >= RIO_MAX_MPORTS) - return -EINVAL; - - mutex_lock(&rio_mport_list_lock); - - list_for_each_entry(port, &rio_mports, node) - if (port->id == mport_id || - (mport_id == RIO_MPORT_ANY && port->nscan == scan_ops)) - port->nscan = NULL; - - list_for_each_entry(scan, &rio_scans, node) { - if (scan->mport_id == mport_id) { - list_del(&scan->node); - kfree(scan); - break; - } - } - - mutex_unlock(&rio_mport_list_lock); - - return 0; -} -EXPORT_SYMBOL_GPL(rio_unregister_scan); - /** * rio_mport_scan - execute enumeration/discovery on the specified mport * @mport_id: number (ID) of mport device diff --git a/drivers/rapidio/rio.h b/drivers/rapidio/rio.h index f482de0d0370..a0e2a09ddb8e 100644 --- a/drivers/rapidio/rio.h +++ b/drivers/rapidio/rio.h @@ -41,9 +41,7 @@ extern void rio_del_device(struct rio_dev *rdev, enum rio_device_state state); extern int rio_enable_rx_tx_port(struct rio_mport *port, int local, u16 destid, u8 hopcount, u8 port_num); extern int rio_register_scan(int mport_id, struct rio_scan *scan_ops); -extern int rio_unregister_scan(int mport_id, struct rio_scan *scan_ops); extern void rio_attach_device(struct rio_dev *rdev); -extern struct rio_mport *rio_find_mport(int mport_id); extern int rio_mport_scan(int mport_id); /* Structures internal to the RIO core code */ diff --git a/include/linux/rio_drv.h b/include/linux/rio_drv.h index e49c32b0f394..dd8afe511242 100644 --- a/include/linux/rio_drv.h +++ b/include/linux/rio_drv.h @@ -391,13 +391,8 @@ struct rio_dev *rio_dev_get(struct rio_dev *); void rio_dev_put(struct rio_dev *); #ifdef CONFIG_RAPIDIO_DMA_ENGINE -extern struct dma_chan *rio_request_dma(struct rio_dev *rdev); extern struct dma_chan *rio_request_mport_dma(struct rio_mport *mport); extern void rio_release_dma(struct dma_chan *dchan); -extern struct dma_async_tx_descriptor *rio_dma_prep_slave_sg( - struct rio_dev *rdev, struct dma_chan *dchan, - struct rio_dma_data *data, - enum dma_transfer_direction direction, unsigned long flags); extern struct dma_async_tx_descriptor *rio_dma_prep_xfer( struct dma_chan *dchan, u16 destid, struct rio_dma_data *data, -- cgit v1.2.3-59-g8ed1b From 2a1c6158131f05c228001e1f73304535bc205444 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sat, 19 Apr 2025 00:49:32 +0100 Subject: relay: remove unused relay_late_setup_files The last use of relay_late_setup_files() was removed in 2018 by commit 2b47733045aa ("drm/i915/guc: Merge log relay file and channel creation") Remove it and the helper it used. relay_late_setup_files() was used for eventually registering 'buffer only' channels. With it gone, delete the docs that explain how to do that. Which suggests it should be possible to lose the 'has_base_filename' flags. (Are there any other uses??) Link: https://lkml.kernel.org/r/20250418234932.490863-1-linux@treblig.org Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Jens Axboe Cc: Al Viro Cc: Andriy Shevchenko Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- Documentation/filesystems/relay.rst | 10 ---- include/linux/relay.h | 3 - kernel/relay.c | 111 +----------------------------------- 3 files changed, 1 insertion(+), 123 deletions(-) diff --git a/Documentation/filesystems/relay.rst b/Documentation/filesystems/relay.rst index 04ad083cfe62..292ba8492aeb 100644 --- a/Documentation/filesystems/relay.rst +++ b/Documentation/filesystems/relay.rst @@ -301,16 +301,6 @@ user-defined data with a channel, and is immediately available (including in create_buf_file()) via chan->private_data or buf->chan->private_data. -Buffer-only channels --------------------- - -These channels have no files associated and can be created with -relay_open(NULL, NULL, ...). Such channels are useful in scenarios such -as when doing early tracing in the kernel, before the VFS is up. In these -cases, one may open a buffer-only channel and then call -relay_late_setup_files() when the kernel is ready to handle files, -to expose the buffered data to the userspace. - Channel 'modes' --------------- diff --git a/include/linux/relay.h b/include/linux/relay.h index 72b876dd5cb8..b3224111d074 100644 --- a/include/linux/relay.h +++ b/include/linux/relay.h @@ -159,9 +159,6 @@ struct rchan *relay_open(const char *base_filename, size_t n_subbufs, const struct rchan_callbacks *cb, void *private_data); -extern int relay_late_setup_files(struct rchan *chan, - const char *base_filename, - struct dentry *parent); extern void relay_close(struct rchan *chan); extern void relay_flush(struct rchan *chan); extern void relay_subbufs_consumed(struct rchan *chan, diff --git a/kernel/relay.c b/kernel/relay.c index 5ac7e711e4b6..c0c93a04d4ce 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -452,7 +452,7 @@ int relay_prepare_cpu(unsigned int cpu) /** * relay_open - create a new relay channel - * @base_filename: base name of files to create, %NULL for buffering only + * @base_filename: base name of files to create * @parent: dentry of parent directory, %NULL for root directory or buffer * @subbuf_size: size of sub-buffers * @n_subbufs: number of sub-buffers @@ -465,10 +465,6 @@ int relay_prepare_cpu(unsigned int cpu) * attributes specified. The created channel buffer files * will be named base_filename0...base_filenameN-1. File * permissions will be %S_IRUSR. - * - * If opening a buffer (@parent = NULL) that you later wish to register - * in a filesystem, call relay_late_setup_files() once the @parent dentry - * is available. */ struct rchan *relay_open(const char *base_filename, struct dentry *parent, @@ -540,111 +536,6 @@ struct rchan_percpu_buf_dispatcher { struct dentry *dentry; }; -/* Called in atomic context. */ -static void __relay_set_buf_dentry(void *info) -{ - struct rchan_percpu_buf_dispatcher *p = info; - - relay_set_buf_dentry(p->buf, p->dentry); -} - -/** - * relay_late_setup_files - triggers file creation - * @chan: channel to operate on - * @base_filename: base name of files to create - * @parent: dentry of parent directory, %NULL for root directory - * - * Returns 0 if successful, non-zero otherwise. - * - * Use to setup files for a previously buffer-only channel created - * by relay_open() with a NULL parent dentry. - * - * For example, this is useful for perfomring early tracing in kernel, - * before VFS is up and then exposing the early results once the dentry - * is available. - */ -int relay_late_setup_files(struct rchan *chan, - const char *base_filename, - struct dentry *parent) -{ - int err = 0; - unsigned int i, curr_cpu; - unsigned long flags; - struct dentry *dentry; - struct rchan_buf *buf; - struct rchan_percpu_buf_dispatcher disp; - - if (!chan || !base_filename) - return -EINVAL; - - strscpy(chan->base_filename, base_filename, NAME_MAX); - - mutex_lock(&relay_channels_mutex); - /* Is chan already set up? */ - if (unlikely(chan->has_base_filename)) { - mutex_unlock(&relay_channels_mutex); - return -EEXIST; - } - chan->has_base_filename = 1; - chan->parent = parent; - - if (chan->is_global) { - err = -EINVAL; - buf = *per_cpu_ptr(chan->buf, 0); - if (!WARN_ON_ONCE(!buf)) { - dentry = relay_create_buf_file(chan, buf, 0); - if (dentry && !WARN_ON_ONCE(!chan->is_global)) { - relay_set_buf_dentry(buf, dentry); - err = 0; - } - } - mutex_unlock(&relay_channels_mutex); - return err; - } - - curr_cpu = get_cpu(); - /* - * The CPU hotplug notifier ran before us and created buffers with - * no files associated. So it's safe to call relay_setup_buf_file() - * on all currently online CPUs. - */ - for_each_online_cpu(i) { - buf = *per_cpu_ptr(chan->buf, i); - if (unlikely(!buf)) { - WARN_ONCE(1, KERN_ERR "CPU has no buffer!\n"); - err = -EINVAL; - break; - } - - dentry = relay_create_buf_file(chan, buf, i); - if (unlikely(!dentry)) { - err = -EINVAL; - break; - } - - if (curr_cpu == i) { - local_irq_save(flags); - relay_set_buf_dentry(buf, dentry); - local_irq_restore(flags); - } else { - disp.buf = buf; - disp.dentry = dentry; - smp_mb(); - /* relay_channels_mutex must be held, so wait. */ - err = smp_call_function_single(i, - __relay_set_buf_dentry, - &disp, 1); - } - if (unlikely(err)) - break; - } - put_cpu(); - mutex_unlock(&relay_channels_mutex); - - return err; -} -EXPORT_SYMBOL_GPL(relay_late_setup_files); - /** * relay_switch_subbuf - switch to a new sub-buffer * @buf: channel buffer -- cgit v1.2.3-59-g8ed1b From 92f3c5a0051d2b56379651522b587ff7309b2606 Mon Sep 17 00:00:00 2001 From: "Herton R. Krzesinski" Date: Fri, 18 Apr 2025 13:50:47 -0300 Subject: lib/test_kmod: do not hardcode/depend on any filesystem Right now test_kmod has hardcoded dependencies on btrfs/xfs. That is not optimal since you end up needing to select/build them, but it is not really required since other fs could be selected for the testing. Also, we can't change the default/driver module used for testing on initialization. Thus make it more generic: introduce two module parameters (start_driver and start_test_fs), which allow to select which modules/fs to use for the testing on test_kmod initialization. Then it's up to the user to select which modules/fs to use for testing based on his config. However, keep test_module as required default. This way, config/modules becomes selectable as when the testing is done from selftests (userspace). While at it, also change trigger_config_run_type, since at module initialization we already set the defaults at __kmod_config_init and should not need to do it again in test_kmod_init(), thus we can avoid to again set test_driver/test_fs. Link: https://lkml.kernel.org/r/20250418165047.702487-1-herton@redhat.com Signed-off-by: Herton R. Krzesinski Reviewed-by: Luis Chambelrain Cc: Daniel Gomez Cc: Nathan Chancellor Cc: Petr Pavlu Cc: Sami Tolvanen Signed-off-by: Andrew Morton --- lib/Kconfig.debug | 6 ---- lib/test_kmod.c | 64 ++++++++++++++++++++----------------- tools/testing/selftests/kmod/config | 5 --- 3 files changed, 34 insertions(+), 41 deletions(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index f9051ab610d5..f999d7e86023 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2983,13 +2983,7 @@ config TEST_DYNAMIC_DEBUG config TEST_KMOD tristate "kmod stress tester" depends on m - depends on NETDEVICES && NET_CORE && INET # for TUN - depends on BLOCK - depends on PAGE_SIZE_LESS_THAN_256KB # for BTRFS select TEST_LKM - select XFS_FS - select TUN - select BTRFS_FS help Test the kernel's module loading mechanism: kmod. kmod implements support to load modules using the Linux kernel's usermode helper. diff --git a/lib/test_kmod.c b/lib/test_kmod.c index 064ed0fce75a..f0dd092860ea 100644 --- a/lib/test_kmod.c +++ b/lib/test_kmod.c @@ -28,14 +28,20 @@ #define TEST_START_NUM_THREADS 50 #define TEST_START_DRIVER "test_module" -#define TEST_START_TEST_FS "xfs" #define TEST_START_TEST_CASE TEST_KMOD_DRIVER - static bool force_init_test = false; -module_param(force_init_test, bool_enable_only, 0644); +module_param(force_init_test, bool_enable_only, 0444); MODULE_PARM_DESC(force_init_test, "Force kicking a test immediately after driver loads"); +static char *start_driver; +module_param(start_driver, charp, 0444); +MODULE_PARM_DESC(start_driver, + "Module/driver to use for the testing after driver loads"); +static char *start_test_fs; +module_param(start_test_fs, charp, 0444); +MODULE_PARM_DESC(start_test_fs, + "File system to use for the testing after driver loads"); /* * For device allocation / registration @@ -508,6 +514,11 @@ static int __trigger_config_run(struct kmod_test_device *test_dev) case TEST_KMOD_DRIVER: return run_test_driver(test_dev); case TEST_KMOD_FS_TYPE: + if (!config->test_fs) { + dev_warn(test_dev->dev, + "No fs type specified, can't run the test\n"); + return -EINVAL; + } return run_test_fs_type(test_dev); default: dev_warn(test_dev->dev, @@ -721,26 +732,20 @@ static ssize_t config_test_fs_show(struct device *dev, static DEVICE_ATTR_RW(config_test_fs); static int trigger_config_run_type(struct kmod_test_device *test_dev, - enum kmod_test_case test_case, - const char *test_str) + enum kmod_test_case test_case) { - int copied = 0; struct test_config *config = &test_dev->config; mutex_lock(&test_dev->config_mutex); switch (test_case) { case TEST_KMOD_DRIVER: - kfree_const(config->test_driver); - config->test_driver = NULL; - copied = config_copy_test_driver_name(config, test_str, - strlen(test_str)); break; case TEST_KMOD_FS_TYPE: - kfree_const(config->test_fs); - config->test_fs = NULL; - copied = config_copy_test_fs(config, test_str, - strlen(test_str)); + if (!config->test_fs) { + mutex_unlock(&test_dev->config_mutex); + return 0; + } break; default: mutex_unlock(&test_dev->config_mutex); @@ -751,11 +756,6 @@ static int trigger_config_run_type(struct kmod_test_device *test_dev, mutex_unlock(&test_dev->config_mutex); - if (copied <= 0 || copied != strlen(test_str)) { - test_dev->test_is_oom = true; - return -ENOMEM; - } - test_dev->test_is_oom = false; return trigger_config_run(test_dev); @@ -800,19 +800,24 @@ static unsigned int kmod_init_test_thread_limit(void) static int __kmod_config_init(struct kmod_test_device *test_dev) { struct test_config *config = &test_dev->config; + const char *test_start_driver = start_driver ? start_driver : + TEST_START_DRIVER; int ret = -ENOMEM, copied; __kmod_config_free(config); - copied = config_copy_test_driver_name(config, TEST_START_DRIVER, - strlen(TEST_START_DRIVER)); - if (copied != strlen(TEST_START_DRIVER)) + copied = config_copy_test_driver_name(config, test_start_driver, + strlen(test_start_driver)); + if (copied != strlen(test_start_driver)) goto err_out; - copied = config_copy_test_fs(config, TEST_START_TEST_FS, - strlen(TEST_START_TEST_FS)); - if (copied != strlen(TEST_START_TEST_FS)) - goto err_out; + + if (start_test_fs) { + copied = config_copy_test_fs(config, start_test_fs, + strlen(start_test_fs)); + if (copied != strlen(start_test_fs)) + goto err_out; + } config->num_threads = kmod_init_test_thread_limit(); config->test_result = 0; @@ -1178,12 +1183,11 @@ static int __init test_kmod_init(void) * lowering the init level for more fun. */ if (force_init_test) { - ret = trigger_config_run_type(test_dev, - TEST_KMOD_DRIVER, "tun"); + ret = trigger_config_run_type(test_dev, TEST_KMOD_DRIVER); if (WARN_ON(ret)) return ret; - ret = trigger_config_run_type(test_dev, - TEST_KMOD_FS_TYPE, "btrfs"); + + ret = trigger_config_run_type(test_dev, TEST_KMOD_FS_TYPE); if (WARN_ON(ret)) return ret; } diff --git a/tools/testing/selftests/kmod/config b/tools/testing/selftests/kmod/config index 259f4fd6b5e2..1f1e63494af9 100644 --- a/tools/testing/selftests/kmod/config +++ b/tools/testing/selftests/kmod/config @@ -1,7 +1,2 @@ CONFIG_TEST_KMOD=m CONFIG_TEST_LKM=m -CONFIG_XFS_FS=m - -# For the module parameter force_init_test is used -CONFIG_TUN=m -CONFIG_BTRFS_FS=m -- cgit v1.2.3-59-g8ed1b From f0eba23cb70a945ea0947bdbf0706ca00da50d26 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 18 Apr 2025 13:03:31 +0100 Subject: crash: fix spelling mistake "crahskernel" -> "crashkernel" There is a spelling mistake in a pr_warn message. Fix it. Link: https://lkml.kernel.org/r/20250418120331.535086-1-colin.i.king@gmail.com Signed-off-by: Colin Ian King Acked-by: Baoquan He Cc: Dave Young Cc: Vivek Goyal Signed-off-by: Andrew Morton --- kernel/crash_reserve.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/crash_reserve.c b/kernel/crash_reserve.c index aff7c0fdbefa..acb6bf42e30d 100644 --- a/kernel/crash_reserve.c +++ b/kernel/crash_reserve.c @@ -131,7 +131,7 @@ static int __init parse_crashkernel_mem(char *cmdline, cur++; *crash_base = memparse(cur, &tmp); if (cur == tmp) { - pr_warn("crahskernel: Memory value expected after '@'\n"); + pr_warn("crashkernel: Memory value expected after '@'\n"); return -EINVAL; } } -- cgit v1.2.3-59-g8ed1b From 2e27fa943b74c9fc5fdf93090508c3c722531c66 Mon Sep 17 00:00:00 2001 From: WangYuli Date: Mon, 21 Apr 2025 15:49:10 +0800 Subject: treewide: fix typo "previlege" There are some spelling mistakes of 'previlege' in comments which should be 'privilege'. Fix them and add it to scripts/spelling.txt. The typo in arch/loongarch/kvm/main.c was corrected by a different patch [1] and is therefore not included in this submission. [1]. https://lore.kernel.org/all/20250420142208.2252280-1-wheatfox17@icloud.com/ Link: https://lkml.kernel.org/r/46AD404E411A4BAC+20250421074910.66988-1-wangyuli@uniontech.com Signed-off-by: WangYuli Signed-off-by: Andrew Morton --- drivers/s390/char/vmlogrdr.c | 4 ++-- include/linux/habanalabs/hl_boot_if.h | 2 +- scripts/spelling.txt | 2 ++ 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/s390/char/vmlogrdr.c b/drivers/s390/char/vmlogrdr.c index dac85294d2f5..e284eea331d7 100644 --- a/drivers/s390/char/vmlogrdr.c +++ b/drivers/s390/char/vmlogrdr.c @@ -255,7 +255,7 @@ static int vmlogrdr_recording(struct vmlogrdr_priv_t * logptr, /* * The recording commands needs to be called with option QID - * for guests that have previlege classes A or B. + * for guests that have privilege classes A or B. * Purging has to be done as separate step, because recording * can't be switched on as long as records are on the queue. * Doing both at the same time doesn't work. @@ -557,7 +557,7 @@ static ssize_t vmlogrdr_purge_store(struct device * dev, /* * The recording command needs to be called with option QID - * for guests that have previlege classes A or B. + * for guests that have privilege classes A or B. * Other guests will not recognize the command and we have to * issue the same command without the QID parameter. */ diff --git a/include/linux/habanalabs/hl_boot_if.h b/include/linux/habanalabs/hl_boot_if.h index d2a9fc96424b..af5fb4ad77eb 100644 --- a/include/linux/habanalabs/hl_boot_if.h +++ b/include/linux/habanalabs/hl_boot_if.h @@ -295,7 +295,7 @@ enum cpu_boot_dev_sts { * Initialized in: linux * * CPU_BOOT_DEV_STS0_GIC_PRIVILEGED_EN GIC access permission only from - * previleged entity. FW sets this status + * privileged entity. FW sets this status * bit for host. If this bit is set then * GIC can not be accessed from host. * Initialized in: linux diff --git a/scripts/spelling.txt b/scripts/spelling.txt index a290db720b0f..ac94fa1c2415 100644 --- a/scripts/spelling.txt +++ b/scripts/spelling.txt @@ -1240,6 +1240,8 @@ prefered||preferred prefferably||preferably prefitler||prefilter preform||perform +previleged||privileged +previlege||privilege premption||preemption prepaired||prepared prepate||prepare -- cgit v1.2.3-59-g8ed1b From 7123dbbef88cfd9f09e8a7899b0911834600cfa3 Mon Sep 17 00:00:00 2001 From: Luo Gengkun Date: Mon, 21 Apr 2025 03:50:21 +0000 Subject: watchdog: fix watchdog may detect false positive of softlockup When updating `watchdog_thresh`, there is a race condition between writing the new `watchdog_thresh` value and stopping the old watchdog timer. If the old timer triggers during this window, it may falsely detect a softlockup due to the old interval and the new `watchdog_thresh` value being used. The problem can be described as follow: # We asuume previous watchdog_thresh is 60, so the watchdog timer is # coming every 24s. echo 10 > /proc/sys/kernel/watchdog_thresh (User space) | +------>+ update watchdog_thresh (We are in kernel now) | | # using old interval and new `watchdog_thresh` +------>+ watchdog hrtimer (irq context: detect softlockup) | | +-------+ | | + softlockup_stop_all To fix this problem, introduce a shadow variable for `watchdog_thresh`. The update to the actual `watchdog_thresh` is delayed until after the old timer is stopped, preventing false positives. The following testcase may help to understand this problem. --------------------------------------------- echo RT_RUNTIME_SHARE > /sys/kernel/debug/sched/features echo -1 > /proc/sys/kernel/sched_rt_runtime_us echo 0 > /sys/kernel/debug/sched/fair_server/cpu3/runtime echo 60 > /proc/sys/kernel/watchdog_thresh taskset -c 3 chrt -r 99 /bin/bash -c "while true;do true; done" & echo 10 > /proc/sys/kernel/watchdog_thresh & --------------------------------------------- The test case above first removes the throttling restrictions for real-time tasks. It then sets watchdog_thresh to 60 and executes a real-time task ,a simple while(1) loop, on cpu3. Consequently, the final command gets blocked because the presence of this real-time thread prevents kworker:3 from being selected by the scheduler. This eventually triggers a softlockup detection on cpu3 due to watchdog_timer_fn operating with inconsistent variable - using both the old interval and the updated watchdog_thresh simultaneously. [nysal@linux.ibm.com: fix the SOFTLOCKUP_DETECTOR=n case] Link: https://lkml.kernel.org/r/20250502111120.282690-1-nysal@linux.ibm.com Link: https://lkml.kernel.org/r/20250421035021.3507649-1-luogengkun@huaweicloud.com Signed-off-by: Luo Gengkun Signed-off-by: Nysal Jan K.A. Cc: Doug Anderson Cc: Joel Granados Cc: Song Liu Cc: Thomas Gleinxer Cc: "Nysal Jan K.A." Cc: Venkat Rao Bagalkote Cc: Signed-off-by: Andrew Morton --- kernel/watchdog.c | 41 +++++++++++++++++++++++++++-------------- 1 file changed, 27 insertions(+), 14 deletions(-) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 9fa2af9dbf2c..2d283e92be5a 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -47,6 +47,7 @@ int __read_mostly watchdog_user_enabled = 1; static int __read_mostly watchdog_hardlockup_user_enabled = WATCHDOG_HARDLOCKUP_DEFAULT; static int __read_mostly watchdog_softlockup_user_enabled = 1; int __read_mostly watchdog_thresh = 10; +static int __read_mostly watchdog_thresh_next; static int __read_mostly watchdog_hardlockup_available; struct cpumask watchdog_cpumask __read_mostly; @@ -870,12 +871,20 @@ int lockup_detector_offline_cpu(unsigned int cpu) return 0; } -static void __lockup_detector_reconfigure(void) +static void __lockup_detector_reconfigure(bool thresh_changed) { cpus_read_lock(); watchdog_hardlockup_stop(); softlockup_stop_all(); + /* + * To prevent watchdog_timer_fn from using the old interval and + * the new watchdog_thresh at the same time, which could lead to + * false softlockup reports, it is necessary to update the + * watchdog_thresh after the softlockup is completed. + */ + if (thresh_changed) + watchdog_thresh = READ_ONCE(watchdog_thresh_next); set_sample_period(); lockup_detector_update_enable(); if (watchdog_enabled && watchdog_thresh) @@ -888,7 +897,7 @@ static void __lockup_detector_reconfigure(void) void lockup_detector_reconfigure(void) { mutex_lock(&watchdog_mutex); - __lockup_detector_reconfigure(); + __lockup_detector_reconfigure(false); mutex_unlock(&watchdog_mutex); } @@ -908,27 +917,29 @@ static __init void lockup_detector_setup(void) return; mutex_lock(&watchdog_mutex); - __lockup_detector_reconfigure(); + __lockup_detector_reconfigure(false); softlockup_initialized = true; mutex_unlock(&watchdog_mutex); } #else /* CONFIG_SOFTLOCKUP_DETECTOR */ -static void __lockup_detector_reconfigure(void) +static void __lockup_detector_reconfigure(bool thresh_changed) { cpus_read_lock(); watchdog_hardlockup_stop(); + if (thresh_changed) + watchdog_thresh = READ_ONCE(watchdog_thresh_next); lockup_detector_update_enable(); watchdog_hardlockup_start(); cpus_read_unlock(); } void lockup_detector_reconfigure(void) { - __lockup_detector_reconfigure(); + __lockup_detector_reconfigure(false); } static inline void lockup_detector_setup(void) { - __lockup_detector_reconfigure(); + __lockup_detector_reconfigure(false); } #endif /* !CONFIG_SOFTLOCKUP_DETECTOR */ @@ -946,11 +957,11 @@ void lockup_detector_soft_poweroff(void) #ifdef CONFIG_SYSCTL /* Propagate any changes to the watchdog infrastructure */ -static void proc_watchdog_update(void) +static void proc_watchdog_update(bool thresh_changed) { /* Remove impossible cpus to keep sysctl output clean. */ cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask); - __lockup_detector_reconfigure(); + __lockup_detector_reconfigure(thresh_changed); } /* @@ -984,7 +995,7 @@ static int proc_watchdog_common(int which, const struct ctl_table *table, int wr } else { err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!err && old != READ_ONCE(*param)) - proc_watchdog_update(); + proc_watchdog_update(false); } mutex_unlock(&watchdog_mutex); return err; @@ -1035,11 +1046,13 @@ static int proc_watchdog_thresh(const struct ctl_table *table, int write, mutex_lock(&watchdog_mutex); - old = READ_ONCE(watchdog_thresh); + watchdog_thresh_next = READ_ONCE(watchdog_thresh); + + old = watchdog_thresh_next; err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - if (!err && write && old != READ_ONCE(watchdog_thresh)) - proc_watchdog_update(); + if (!err && write && old != READ_ONCE(watchdog_thresh_next)) + proc_watchdog_update(true); mutex_unlock(&watchdog_mutex); return err; @@ -1060,7 +1073,7 @@ static int proc_watchdog_cpumask(const struct ctl_table *table, int write, err = proc_do_large_bitmap(table, write, buffer, lenp, ppos); if (!err && write) - proc_watchdog_update(); + proc_watchdog_update(false); mutex_unlock(&watchdog_mutex); return err; @@ -1080,7 +1093,7 @@ static const struct ctl_table watchdog_sysctls[] = { }, { .procname = "watchdog_thresh", - .data = &watchdog_thresh, + .data = &watchdog_thresh_next, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_watchdog_thresh, -- cgit v1.2.3-59-g8ed1b From 3dc32adf98147b36b25dc579bb438c9ea086b1b4 Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Tue, 22 Apr 2025 14:14:49 +0100 Subject: maccess: fix strncpy_from_user_nofault() empty string handling strncpy_from_user_nofault() should return the length of the copied string including the trailing NUL, but if the argument unsafe_addr points to an empty string ({'\0'}), the return value is 0. This happens as strncpy_from_user() copies terminal symbol into dst and returns 0 (as expected), but strncpy_from_user_nofault does not modify ret as it is not equal to count and not greater than 0, so 0 is returned, which contradicts the contract. Link: https://lkml.kernel.org/r/20250422131449.57177-1-mykyta.yatsenko5@gmail.com Signed-off-by: Mykyta Yatsenko Reviewed-by: Andrii Nakryiko Cc: "Masami Hiramatsu (Google)" Cc: Steven Rostedt Cc: Kees Cook Signed-off-by: Andrew Morton --- mm/maccess.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/maccess.c b/mm/maccess.c index 8f0906180a94..831b4dd7296c 100644 --- a/mm/maccess.c +++ b/mm/maccess.c @@ -196,7 +196,7 @@ long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr, if (ret >= count) { ret = count; dst[ret - 1] = '\0'; - } else if (ret > 0) { + } else if (ret >= 0) { ret++; } -- cgit v1.2.3-59-g8ed1b From 9c7b53b21fb1df20f85e3822160b4687f98ff7b3 Mon Sep 17 00:00:00 2001 From: Marc Herbert Date: Thu, 24 Apr 2025 19:40:35 +0000 Subject: compiler_types.h: fix "unused variable" in __compiletime_assert() This refines commit c03567a8e8d5 ("include/linux/compiler.h: don't perform compiletime_assert with -O0") and restores #ifdef __OPTIMIZE__ symmetry by evaluating the 'condition' variable in both compile-time variants of __compiletimeassert(). As __OPTIMIZE__ is always true by default, this commit does not change anything by default. But it fixes warnings with _non-default_ CFLAGS like for instance this: make CFLAGS_tcp.o='-Og -U__OPTIMIZE__' from net/ipv4/tcp.c:273: include/net/sch_generic.h: In function `qdisc_cb_private_validate': include/net/sch_generic.h:511:30: error: unused variable `qcb' [-Werror=unused-variable] { struct qdisc_skb_cb *qcb; BUILD_BUG_ON(sizeof(skb->cb) < sizeof(*qcb)); ... } [akpm@linux-foundation.org: regularize comment layout, reflow comment] Link: https://lkml.kernel.org/r/20250424194048.652571-1-marc.herbert@linux.intel.com Signed-off-by: Marc Herbert Cc: Alexander Potapenko Cc: Changbin Du Cc: Jan Hendrik Farr Cc: Macro Elver Cc: Miguel Ojeda Cc: Tony Ambardar Cc: Uros Bizjak Cc: Kees Cook Signed-off-by: Andrew Morton --- include/linux/compiler_types.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 501cffddc2f4..1f2eee8331d3 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -525,6 +525,12 @@ struct ftrace_likely_data { sizeof(t) == sizeof(int) || sizeof(t) == sizeof(long)) #ifdef __OPTIMIZE__ +/* + * #ifdef __OPTIMIZE__ is only a good approximation; for instance "make + * CFLAGS_foo.o=-Og" defines __OPTIMIZE__, does not elide the conditional code + * and can break compilation with wrong error message(s). Combine with + * -U__OPTIMIZE__ when needed. + */ # define __compiletime_assert(condition, msg, prefix, suffix) \ do { \ /* \ @@ -538,7 +544,7 @@ struct ftrace_likely_data { prefix ## suffix(); \ } while (0) #else -# define __compiletime_assert(condition, msg, prefix, suffix) do { } while (0) +# define __compiletime_assert(condition, msg, prefix, suffix) ((void)(condition)) #endif #define _compiletime_assert(condition, msg, prefix, suffix) \ -- cgit v1.2.3-59-g8ed1b From d66adabe91803ef34a8b90613c81267b5ded1472 Mon Sep 17 00:00:00 2001 From: Jeongjun Park Date: Thu, 24 Apr 2025 23:33:22 +0900 Subject: ipc: fix to protect IPCS lookups using RCU syzbot reported that it discovered a use-after-free vulnerability, [0] [0]: https://lore.kernel.org/all/67af13f8.050a0220.21dd3.0038.GAE@google.com/ idr_for_each() is protected by rwsem, but this is not enough. If it is not protected by RCU read-critical region, when idr_for_each() calls radix_tree_node_free() through call_rcu() to free the radix_tree_node structure, the node will be freed immediately, and when reading the next node in radix_tree_for_each_slot(), the already freed memory may be read. Therefore, we need to add code to make sure that idr_for_each() is protected within the RCU read-critical region when we call it in shm_destroy_orphaned(). Link: https://lkml.kernel.org/r/20250424143322.18830-1-aha310510@gmail.com Fixes: b34a6b1da371 ("ipc: introduce shm_rmid_forced sysctl") Signed-off-by: Jeongjun Park Reported-by: syzbot+a2b84e569d06ca3a949c@syzkaller.appspotmail.com Cc: Jeongjun Park Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Vasiliy Kulikov Cc: Signed-off-by: Andrew Morton --- ipc/shm.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ipc/shm.c b/ipc/shm.c index 99564c870084..492fcc699985 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -431,8 +431,11 @@ static int shm_try_destroy_orphaned(int id, void *p, void *data) void shm_destroy_orphaned(struct ipc_namespace *ns) { down_write(&shm_ids(ns).rwsem); - if (shm_ids(ns).in_use) + if (shm_ids(ns).in_use) { + rcu_read_lock(); idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_orphaned, ns); + rcu_read_unlock(); + } up_write(&shm_ids(ns).rwsem); } -- cgit v1.2.3-59-g8ed1b From cdc3ed3035d0fe934aa1d9b78ce256752fd3bb7d Mon Sep 17 00:00:00 2001 From: Murad Masimov Date: Wed, 2 Apr 2025 09:56:27 +0300 Subject: ocfs2: fix possible memory leak in ocfs2_finish_quota_recovery If ocfs2_finish_quota_recovery() exits due to an error before passing all rc_list elements to ocfs2_recover_local_quota_file() then it can lead to a memory leak as rc_list may still contain elements that have to be freed. Release all memory allocated by ocfs2_add_recovery_chunk() using ocfs2_free_quota_recovery() instead of kfree(). Found by Linux Verification Center (linuxtesting.org) with Syzkaller. Link: https://lkml.kernel.org/r/20250402065628.706359-2-m.masimov@mt-integration.ru Fixes: 2205363dce74 ("ocfs2: Implement quota recovery") Signed-off-by: Murad Masimov Reviewed-by: Jan Kara Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/quota_local.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index e272429da3db..de7f12858729 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -674,7 +674,7 @@ out_put: break; } out: - kfree(rec); + ocfs2_free_quota_recovery(rec); return status; } -- cgit v1.2.3-59-g8ed1b From 1785c67e2adc4d1899685b36dd10f7ebbf117178 Mon Sep 17 00:00:00 2001 From: Chen Ni Date: Tue, 22 Apr 2025 15:30:51 +0800 Subject: ocfs2: remove unnecessary NULL check before unregister_sysctl_table() unregister_sysctl_table() checks for NULL pointers internally. Remove unneeded NULL check here. Link: https://lkml.kernel.org/r/20250422073051.1334310-1-nichen@iscas.ac.cn Signed-off-by: Chen Ni Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/stackglue.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index ddd761cf44c8..a28c127b9934 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -691,8 +691,7 @@ static void __exit ocfs2_stack_glue_exit(void) memset(&locking_max_version, 0, sizeof(struct ocfs2_protocol_version)); ocfs2_sysfs_exit(); - if (ocfs2_table_header) - unregister_sysctl_table(ocfs2_table_header); + unregister_sysctl_table(ocfs2_table_header); } MODULE_AUTHOR("Oracle"); -- cgit v1.2.3-59-g8ed1b From f3def8270c67217efb0e23dcd54714f74de3b6b2 Mon Sep 17 00:00:00 2001 From: Fedor Pchelkin Date: Sun, 27 Apr 2025 23:14:49 +0300 Subject: sort.h: hoist cmp_int() into generic header file Deduplicate the same functionality implemented in several places by moving the cmp_int() helper macro into linux/sort.h. The macro performs a three-way comparison of the arguments mostly useful in different sorting strategies and algorithms. Link: https://lkml.kernel.org/r/20250427201451.900730-1-pchelkin@ispras.ru Signed-off-by: Fedor Pchelkin Suggested-by: Darrick J. Wong Acked-by: Kent Overstreet Acked-by: Coly Li Cc: Al Viro Cc: Carlos Maiolino Cc: Christian Brauner Cc: Coly Li Cc: Jan Kara Signed-off-by: Andrew Morton --- drivers/md/bcache/btree.c | 3 +-- fs/bcachefs/util.h | 3 +-- fs/pipe.c | 3 +-- fs/xfs/xfs_zone_gc.c | 2 -- include/linux/sort.h | 10 ++++++++++ 5 files changed, 13 insertions(+), 8 deletions(-) diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index ed40d8600656..2cc2eb24dc8a 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -36,6 +36,7 @@ #include #include #include +#include #include /* @@ -559,8 +560,6 @@ static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp) } } -#define cmp_int(l, r) ((l > r) - (l < r)) - #ifdef CONFIG_PROVE_LOCKING static int btree_lock_cmp_fn(const struct lockdep_map *_a, const struct lockdep_map *_b) diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 3e52c7f8ddd2..7ec1fc8b46f9 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -669,8 +670,6 @@ static inline void percpu_memset(void __percpu *p, int c, size_t bytes) u64 *bch2_acc_percpu_u64s(u64 __percpu *, unsigned); -#define cmp_int(l, r) ((l > r) - (l < r)) - static inline int u8_cmp(u8 l, u8 r) { return cmp_int(l, r); diff --git a/fs/pipe.c b/fs/pipe.c index da45edd68c41..45077c37bad1 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -76,8 +77,6 @@ static unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR; * -- Manfred Spraul 2002-05-09 */ -#define cmp_int(l, r) ((l > r) - (l < r)) - #ifdef CONFIG_PROVE_LOCKING static int pipe_lock_cmp_fn(const struct lockdep_map *a, const struct lockdep_map *b) diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c index 81c94dd1d596..2f9caa3eb828 100644 --- a/fs/xfs/xfs_zone_gc.c +++ b/fs/xfs/xfs_zone_gc.c @@ -290,8 +290,6 @@ xfs_zone_gc_query_cb( return 0; } -#define cmp_int(l, r) ((l > r) - (l < r)) - static int xfs_zone_gc_rmap_rec_cmp( const void *a, diff --git a/include/linux/sort.h b/include/linux/sort.h index 8e5603b10941..c01ef804a0eb 100644 --- a/include/linux/sort.h +++ b/include/linux/sort.h @@ -4,6 +4,16 @@ #include +/** + * cmp_int - perform a three-way comparison of the arguments + * @l: the left argument + * @r: the right argument + * + * Return: 1 if the left argument is greater than the right one; 0 if the + * arguments are equal; -1 if the left argument is less than the right one. + */ +#define cmp_int(l, r) (((l) > (r)) - ((l) < (r))) + void sort_r(void *base, size_t num, size_t size, cmp_r_func_t cmp_func, swap_r_func_t swap_func, -- cgit v1.2.3-59-g8ed1b From c91d78622e16f628176d7248044106b03818af6d Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 28 Apr 2025 10:27:37 +0300 Subject: util_macros.h: fix the reference in kernel-doc In PTR_IF() description the text refers to the parameter as (ptr) while the kernel-doc format asks for @ptr. Fix this accordingly. Link: https://lkml.kernel.org/r/20250428072737.3265239-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Cc: Alexandru Ardelean Signed-off-by: Andrew Morton --- include/linux/util_macros.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/util_macros.h b/include/linux/util_macros.h index 7b64fb597f85..8183ac610cc5 100644 --- a/include/linux/util_macros.h +++ b/include/linux/util_macros.h @@ -85,7 +85,7 @@ * @ptr: A pointer to assign if @cond is true. * * PTR_IF(IS_ENABLED(CONFIG_FOO), ptr) evaluates to @ptr if CONFIG_FOO is set - * to 'y' or 'm', or to NULL otherwise. The (ptr) argument must be a pointer. + * to 'y' or 'm', or to NULL otherwise. The @ptr argument must be a pointer. * * The macro can be very useful to help compiler dropping dead code. * -- cgit v1.2.3-59-g8ed1b From f7a667a046cf9962b0b3eb763b1fbe3eec925a2c Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 28 Apr 2025 11:57:20 -0700 Subject: kexec_file: use SHA-256 library API instead of crypto_shash API This user of SHA-256 does not support any other algorithm, so the crypto_shash abstraction provides no value. Just use the SHA-256 library API instead, which is much simpler and easier to use. Tested with '/sbin/kexec --kexec-file-syscall'. Link: https://lkml.kernel.org/r/20250428185721.844686-1-ebiggers@kernel.org Signed-off-by: Eric Biggers Cc: Baoquan He Cc: Vivek Goyal Cc: Dave Young Signed-off-by: Andrew Morton --- kernel/Kconfig.kexec | 3 +- kernel/kexec_file.c | 78 ++++++++++------------------------------------------ 2 files changed, 16 insertions(+), 65 deletions(-) diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec index 4d111f871951..9fea9dca15bd 100644 --- a/kernel/Kconfig.kexec +++ b/kernel/Kconfig.kexec @@ -38,8 +38,7 @@ config KEXEC config KEXEC_FILE bool "Enable kexec file based system call" depends on ARCH_SUPPORTS_KEXEC_FILE - select CRYPTO - select CRYPTO_SHA256 + select CRYPTO_LIB_SHA256 select KEXEC_CORE help This is new version of kexec system call. This system call is diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index fba686487e3b..14e5087b4277 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include @@ -712,11 +711,10 @@ int kexec_add_buffer(struct kexec_buf *kbuf) /* Calculate and store the digest of segments */ static int kexec_calculate_store_digests(struct kimage *image) { - struct crypto_shash *tfm; - struct shash_desc *desc; + struct sha256_state state; int ret = 0, i, j, zero_buf_sz, sha_region_sz; - size_t desc_size, nullsz; - char *digest; + size_t nullsz; + u8 digest[SHA256_DIGEST_SIZE]; void *zero_buf; struct kexec_sha_region *sha_regions; struct purgatory_info *pi = &image->purgatory_info; @@ -727,37 +725,12 @@ static int kexec_calculate_store_digests(struct kimage *image) zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT); zero_buf_sz = PAGE_SIZE; - tfm = crypto_alloc_shash("sha256", 0, 0); - if (IS_ERR(tfm)) { - ret = PTR_ERR(tfm); - goto out; - } - - desc_size = crypto_shash_descsize(tfm) + sizeof(*desc); - desc = kzalloc(desc_size, GFP_KERNEL); - if (!desc) { - ret = -ENOMEM; - goto out_free_tfm; - } - sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region); sha_regions = vzalloc(sha_region_sz); - if (!sha_regions) { - ret = -ENOMEM; - goto out_free_desc; - } - - desc->tfm = tfm; - - ret = crypto_shash_init(desc); - if (ret < 0) - goto out_free_sha_regions; + if (!sha_regions) + return -ENOMEM; - digest = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL); - if (!digest) { - ret = -ENOMEM; - goto out_free_sha_regions; - } + sha256_init(&state); for (j = i = 0; i < image->nr_segments; i++) { struct kexec_segment *ksegment; @@ -776,10 +749,7 @@ static int kexec_calculate_store_digests(struct kimage *image) if (ksegment->kbuf == pi->purgatory_buf) continue; - ret = crypto_shash_update(desc, ksegment->kbuf, - ksegment->bufsz); - if (ret) - break; + sha256_update(&state, ksegment->kbuf, ksegment->bufsz); /* * Assume rest of the buffer is filled with zero and @@ -791,44 +761,26 @@ static int kexec_calculate_store_digests(struct kimage *image) if (bytes > zero_buf_sz) bytes = zero_buf_sz; - ret = crypto_shash_update(desc, zero_buf, bytes); - if (ret) - break; + sha256_update(&state, zero_buf, bytes); nullsz -= bytes; } - if (ret) - break; - sha_regions[j].start = ksegment->mem; sha_regions[j].len = ksegment->memsz; j++; } - if (!ret) { - ret = crypto_shash_final(desc, digest); - if (ret) - goto out_free_digest; - ret = kexec_purgatory_get_set_symbol(image, "purgatory_sha_regions", - sha_regions, sha_region_sz, 0); - if (ret) - goto out_free_digest; + sha256_final(&state, digest); - ret = kexec_purgatory_get_set_symbol(image, "purgatory_sha256_digest", - digest, SHA256_DIGEST_SIZE, 0); - if (ret) - goto out_free_digest; - } + ret = kexec_purgatory_get_set_symbol(image, "purgatory_sha_regions", + sha_regions, sha_region_sz, 0); + if (ret) + goto out_free_sha_regions; -out_free_digest: - kfree(digest); + ret = kexec_purgatory_get_set_symbol(image, "purgatory_sha256_digest", + digest, SHA256_DIGEST_SIZE, 0); out_free_sha_regions: vfree(sha_regions); -out_free_desc: - kfree(desc); -out_free_tfm: - kfree(tfm); -out: return ret; } -- cgit v1.2.3-59-g8ed1b From f43f02429295486059605997bc43803527d69791 Mon Sep 17 00:00:00 2001 From: Wentao Liang Date: Tue, 29 Apr 2025 02:37:07 +0900 Subject: nilfs2: add pointer check for nilfs_direct_propagate() Patch series "nilfs2: improve sanity checks in dirty state propagation". This fixes one missed check for block mapping anomalies and one improper return of an error code during a preparation step for log writing, thereby improving checking for filesystem corruption on writeback. This patch (of 2): In nilfs_direct_propagate(), the printer get from nilfs_direct_get_ptr() need to be checked to ensure it is not an invalid pointer. If the pointer value obtained by nilfs_direct_get_ptr() is NILFS_BMAP_INVALID_PTR, means that the metadata (in this case, i_bmap in the nilfs_inode_info struct) that should point to the data block at the buffer head of the argument is corrupted and the data block is orphaned, meaning that the file system has lost consistency. Add a value check and return -EINVAL when it is an invalid pointer. Link: https://lkml.kernel.org/r/20250428173808.6452-1-konishi.ryusuke@gmail.com Link: https://lkml.kernel.org/r/20250428173808.6452-2-konishi.ryusuke@gmail.com Fixes: 36a580eb489f ("nilfs2: direct block mapping") Signed-off-by: Wentao Liang Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/direct.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c index 893ab36824cc..2d8dc6b35b54 100644 --- a/fs/nilfs2/direct.c +++ b/fs/nilfs2/direct.c @@ -273,6 +273,9 @@ static int nilfs_direct_propagate(struct nilfs_bmap *bmap, dat = nilfs_bmap_get_dat(bmap); key = nilfs_bmap_data_get_key(bmap, bh); ptr = nilfs_direct_get_ptr(bmap, key); + if (ptr == NILFS_BMAP_INVALID_PTR) + return -EINVAL; + if (!buffer_nilfs_volatile(bh)) { oldreq.pr_entry_nr = ptr; newreq.pr_entry_nr = ptr; -- cgit v1.2.3-59-g8ed1b From 8e39fbb1edbb4ec9d7c1124f403877fc167fcecd Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Tue, 29 Apr 2025 02:37:08 +0900 Subject: nilfs2: do not propagate ENOENT error from nilfs_btree_propagate() In preparation for writing logs, in nilfs_btree_propagate(), which makes parent and ancestor node blocks dirty starting from a modified data block or b-tree node block, if the starting block does not belong to the b-tree, i.e. is isolated, nilfs_btree_do_lookup() called within the function fails with -ENOENT. In this case, even though -ENOENT is an internal code, it is propagated to the log writer via nilfs_bmap_propagate() and may be erroneously returned to system calls such as fsync(). Fix this issue by changing the error code to -EINVAL in this case, and having the bmap layer detect metadata corruption and convert the error code appropriately. Link: https://lkml.kernel.org/r/20250428173808.6452-3-konishi.ryusuke@gmail.com Fixes: 1f5abe7e7dbc ("nilfs2: replace BUG_ON and BUG calls triggerable from ioctl") Signed-off-by: Ryusuke Konishi Cc: Wentao Liang Signed-off-by: Andrew Morton --- fs/nilfs2/btree.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index 0d8f7fb15c2e..dd0c8e560ef6 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -2102,11 +2102,13 @@ static int nilfs_btree_propagate(struct nilfs_bmap *btree, ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1, 0); if (ret < 0) { - if (unlikely(ret == -ENOENT)) + if (unlikely(ret == -ENOENT)) { nilfs_crit(btree->b_inode->i_sb, "writing node/leaf block does not appear in b-tree (ino=%lu) at key=%llu, level=%d", btree->b_inode->i_ino, (unsigned long long)key, level); + ret = -EINVAL; + } goto out; } -- cgit v1.2.3-59-g8ed1b From 479d26ee013c5388ad6855e512ab20d81e43750d Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Thu, 1 May 2025 02:05:02 +0100 Subject: lib/oid_registry.c: remove unused sprint_OID sprint_OID() was added as part of 2012's commit 4f73175d0375 ("X.509: Add utility functions to render OIDs as strings") but it hasn't been used. Remove it. Note that there's also 'sprint_oid' (lower case) which is used in a lot of places; that's left as is except for fixing its case in a comment. Link: https://lkml.kernel.org/r/20250501010502.326472-1-linux@treblig.org Signed-off-by: Dr. David Alan Gilbert Signed-off-by: Andrew Morton --- include/linux/oid_registry.h | 1 - lib/oid_registry.c | 25 +------------------------ 2 files changed, 1 insertion(+), 25 deletions(-) diff --git a/include/linux/oid_registry.h b/include/linux/oid_registry.h index 6f9242259edc..6de479ebbe5d 100644 --- a/include/linux/oid_registry.h +++ b/include/linux/oid_registry.h @@ -151,6 +151,5 @@ enum OID { extern enum OID look_up_OID(const void *data, size_t datasize); extern int parse_OID(const void *data, size_t datasize, enum OID *oid); extern int sprint_oid(const void *, size_t, char *, size_t); -extern int sprint_OID(enum OID, char *, size_t); #endif /* _LINUX_OID_REGISTRY_H */ diff --git a/lib/oid_registry.c b/lib/oid_registry.c index fe6705cfd780..9b757a117f09 100644 --- a/lib/oid_registry.c +++ b/lib/oid_registry.c @@ -117,7 +117,7 @@ int parse_OID(const void *data, size_t datasize, enum OID *oid) EXPORT_SYMBOL_GPL(parse_OID); /* - * sprint_OID - Print an Object Identifier into a buffer + * sprint_oid - Print an Object Identifier into a buffer * @data: The encoded OID to print * @datasize: The size of the encoded OID * @buffer: The buffer to render into @@ -173,26 +173,3 @@ bad: return -EBADMSG; } EXPORT_SYMBOL_GPL(sprint_oid); - -/** - * sprint_OID - Print an Object Identifier into a buffer - * @oid: The OID to print - * @buffer: The buffer to render into - * @bufsize: The size of the buffer - * - * The OID is rendered into the buffer in "a.b.c.d" format and the number of - * bytes is returned. - */ -int sprint_OID(enum OID oid, char *buffer, size_t bufsize) -{ - int ret; - - BUG_ON(oid >= OID__NR); - - ret = sprint_oid(oid_data + oid_index[oid], - oid_index[oid + 1] - oid_index[oid], - buffer, bufsize); - BUG_ON(ret == -EBADMSG); - return ret; -} -EXPORT_SYMBOL_GPL(sprint_OID); -- cgit v1.2.3-59-g8ed1b From f11c1efe46ad84555a0948401c7bdb63d711088d Mon Sep 17 00:00:00 2001 From: Chelsy Ratnawat Date: Sat, 3 May 2025 14:19:59 -0700 Subject: selftests: fix some typos in tools/testing/selftests Fix multiple spelling errors: - "rougly" -> "roughly" - "fielesystems" -> "filesystems" - "Can'" -> "Can't" Link: https://lkml.kernel.org/r/20250503211959.507815-1-chelsyratnawat2001@gmail.com Signed-off-by: Chelsy Ratnawat Cc: Christian Brauner Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/filesystems/file_stressor.c | 2 +- tools/testing/selftests/mm/gup_longterm.c | 2 +- tools/testing/selftests/thermal/intel/power_floor/power_floor_test.c | 2 +- .../selftests/thermal/intel/workload_hint/workload_hint_test.c | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/filesystems/file_stressor.c b/tools/testing/selftests/filesystems/file_stressor.c index 1136f93a9977..01dd89f8e52f 100644 --- a/tools/testing/selftests/filesystems/file_stressor.c +++ b/tools/testing/selftests/filesystems/file_stressor.c @@ -156,7 +156,7 @@ TEST_F_TIMEOUT(file_stressor, slab_typesafe_by_rcu, 900 * 2) ssize_t nr_read; /* - * Concurrently read /proc//fd/ which rougly does: + * Concurrently read /proc//fd/ which roughly does: * * f = fget_task_next(p, &fd); * if (!f) diff --git a/tools/testing/selftests/mm/gup_longterm.c b/tools/testing/selftests/mm/gup_longterm.c index 21595b20bbc3..d50ada0c7dbf 100644 --- a/tools/testing/selftests/mm/gup_longterm.c +++ b/tools/testing/selftests/mm/gup_longterm.c @@ -158,7 +158,7 @@ static void do_test(int fd, size_t size, enum test_type type, bool shared) /* * R/O pinning or pinning in a private mapping is always * expected to work. Otherwise, we expect long-term R/W pinning - * to only succeed for special fielesystems. + * to only succeed for special filesystems. */ should_work = !shared || !rw || fs_supports_writable_longterm_pinning(fs_type); diff --git a/tools/testing/selftests/thermal/intel/power_floor/power_floor_test.c b/tools/testing/selftests/thermal/intel/power_floor/power_floor_test.c index 0326b39a11b9..30cab5d425d2 100644 --- a/tools/testing/selftests/thermal/intel/power_floor/power_floor_test.c +++ b/tools/testing/selftests/thermal/intel/power_floor/power_floor_test.c @@ -56,7 +56,7 @@ int main(int argc, char **argv) } if (write(fd, "1\n", 2) < 0) { - perror("Can' enable power floor notifications\n"); + perror("Can't enable power floor notifications\n"); exit(1); } diff --git a/tools/testing/selftests/thermal/intel/workload_hint/workload_hint_test.c b/tools/testing/selftests/thermal/intel/workload_hint/workload_hint_test.c index 217c3a641c53..a40097232967 100644 --- a/tools/testing/selftests/thermal/intel/workload_hint/workload_hint_test.c +++ b/tools/testing/selftests/thermal/intel/workload_hint/workload_hint_test.c @@ -37,7 +37,7 @@ void workload_hint_exit(int signum) } if (write(fd, "0\n", 2) < 0) { - perror("Can' disable workload hints\n"); + perror("Can't disable workload hints\n"); exit(1); } @@ -99,7 +99,7 @@ int main(int argc, char **argv) } if (write(fd, "1\n", 2) < 0) { - perror("Can' enable workload hints\n"); + perror("Can't enable workload hints\n"); exit(1); } -- cgit v1.2.3-59-g8ed1b From 6be7045c7756caf2d445dc66473e45ac5779cd55 Mon Sep 17 00:00:00 2001 From: Illia Ostapyshyn Date: Sat, 3 May 2025 14:32:31 +0200 Subject: scripts/gdb: fix kgdb probing on single-core systems Patch series "scripts/gdb: Fixes related to lx_per_cpu()". These patches (1) fix kgdb detection on systems featuring a single CPU and (2) update the documentation to reflect the current usage of lx_per_cpu() and update an outdated example of its usage. This patch (of 2): When requested the list of threads via qfThreadInfo, gdb_cmd_query in kernel/debug/gdbstub.c first returns "shadow" threads for CPUs followed by the actual tasks in the system. Extended qThreadExtraInfo queries yield "shadowCPU%d" as the name for the CPU core threads. This behavior is used by get_gdbserver_type() to probe for KGDB by matching the name for the thread 2 against "shadowCPU". This breaks down on single-core systems, where thread 2 is the first nonshadow thread. Request the name for thread 1 instead. As GDB assigns thread IDs in the order of their appearance, it is safe to assume shadowCPU0 at ID 1 as long as CPU0 is not hotplugged. Before: (gdb) info threads Id Target Id Frame 1 Thread 4294967294 (shadowCPU0) kgdb_breakpoint () * 2 Thread 1 (swapper/0) kgdb_breakpoint () 3 Thread 2 (kthreadd) 0x0000000000000000 in ?? () ... (gdb) p $lx_current().comm Sorry, obtaining the current CPU is not yet supported with this gdb server. After: (gdb) info threads Id Target Id Frame 1 Thread 4294967294 (shadowCPU0) kgdb_breakpoint () * 2 Thread 1 (swapper/0) kgdb_breakpoint () 3 Thread 2 (kthreadd) 0x0000000000000000 in ?? () ... (gdb) p $lx_current().comm $1 = "swapper/0\000\000\000\000\000\000" Link: https://lkml.kernel.org/r/20250503123234.2407184-1-illia@yshyn.com Link: https://lkml.kernel.org/r/20250503123234.2407184-2-illia@yshyn.com Signed-off-by: Illia Ostapyshyn Cc: Alex Shi Cc: Brendan Jackman Cc: Dongliang Mu Cc: Florian Rommel Cc: Hu Haowen <2023002089@link.tyut.edu.cn> Cc: Jan Kiszka Cc: Jonathan Corbet Cc: Kieran Bingham Cc: Yanteng Si Signed-off-by: Andrew Morton --- scripts/gdb/linux/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/gdb/linux/utils.py b/scripts/gdb/linux/utils.py index 03ebdccf5f69..877404e92dbb 100644 --- a/scripts/gdb/linux/utils.py +++ b/scripts/gdb/linux/utils.py @@ -200,7 +200,7 @@ def get_gdbserver_type(): def probe_kgdb(): try: - thread_info = gdb.execute("info thread 2", to_string=True) + thread_info = gdb.execute("info thread 1", to_string=True) return "shadowCPU" in thread_info except gdb.error: return False -- cgit v1.2.3-59-g8ed1b From 09e1d93a421fa5f6699a9dbdbfc09c1008de4b9a Mon Sep 17 00:00:00 2001 From: Illia Ostapyshyn Date: Sat, 3 May 2025 14:32:32 +0200 Subject: scripts/gdb: update documentation for lx_per_cpu Commit db08c53fdd542bb7f83b ("scripts/gdb: fix parameter handling in $lx_per_cpu") changed the parameter handling of lx_per_cpu to use GdbValue instead of parsing the variable name. Update the documentation to reflect the new lx_per_cpu usage. Update the hrtimer_bases example to use rb_tree instead of the timerqueue_head.next pointer removed in commit 511885d7061eda3eb1fa ("lib/timerqueue: Rely on rbtree semantics for next timer"). Link: https://lkml.kernel.org/r/20250503123234.2407184-3-illia@yshyn.com Signed-off-by: Illia Ostapyshyn Cc: Alex Shi Cc: Brendan Jackman Cc: Dongliang Mu Cc: Florian Rommel Cc: Hu Haowen <2023002089@link.tyut.edu.cn> Cc: Jan Kiszka Cc: Jonathan Corbet Cc: Kieran Bingham Cc: Yanteng Si Signed-off-by: Andrew Morton --- .../process/debugging/gdb-kernel-debugging.rst | 34 ++++++++++------------ .../zh_CN/dev-tools/gdb-kernel-debugging.rst | 34 ++++++++++------------ .../zh_TW/dev-tools/gdb-kernel-debugging.rst | 34 ++++++++++------------ scripts/gdb/linux/cpus.py | 4 +-- 4 files changed, 47 insertions(+), 59 deletions(-) diff --git a/Documentation/process/debugging/gdb-kernel-debugging.rst b/Documentation/process/debugging/gdb-kernel-debugging.rst index 895285c037c7..9475c759c722 100644 --- a/Documentation/process/debugging/gdb-kernel-debugging.rst +++ b/Documentation/process/debugging/gdb-kernel-debugging.rst @@ -127,35 +127,31 @@ Examples of using the Linux-provided gdb helpers - Make use of the per-cpu function for the current or a specified CPU:: - (gdb) p $lx_per_cpu("runqueues").nr_running + (gdb) p $lx_per_cpu(runqueues).nr_running $3 = 1 - (gdb) p $lx_per_cpu("runqueues", 2).nr_running + (gdb) p $lx_per_cpu(runqueues, 2).nr_running $4 = 0 - Dig into hrtimers using the container_of helper:: - (gdb) set $next = $lx_per_cpu("hrtimer_bases").clock_base[0].active.next - (gdb) p *$container_of($next, "struct hrtimer", "node") + (gdb) set $leftmost = $lx_per_cpu(hrtimer_bases).clock_base[0].active.rb_root.rb_leftmost + (gdb) p *$container_of($leftmost, "struct hrtimer", "node") $5 = { node = { node = { - __rb_parent_color = 18446612133355256072, - rb_right = 0x0 , - rb_left = 0x0 + __rb_parent_color = 18446612686384860673, + rb_right = 0xffff888231da8b00, + rb_left = 0x0 }, - expires = { - tv64 = 1835268000000 - } + expires = 1228461000000 }, - _softexpires = { - tv64 = 1835268000000 - }, - function = 0xffffffff81078232 , - base = 0xffff88003fd0d6f0, - state = 1, - start_pid = 0, - start_site = 0xffffffff81055c1f , - start_comm = "swapper/2\000\000\000\000\000\000" + _softexpires = 1228461000000, + function = 0xffffffff8137ab20 , + base = 0xffff888231d9b4c0, + state = 1 '\001', + is_rel = 0 '\000', + is_soft = 0 '\000', + is_hard = 1 '\001' } diff --git a/Documentation/translations/zh_CN/dev-tools/gdb-kernel-debugging.rst b/Documentation/translations/zh_CN/dev-tools/gdb-kernel-debugging.rst index 3c133a918f30..282aacd33442 100644 --- a/Documentation/translations/zh_CN/dev-tools/gdb-kernel-debugging.rst +++ b/Documentation/translations/zh_CN/dev-tools/gdb-kernel-debugging.rst @@ -120,35 +120,31 @@ Kgdb内核调试器、QEMU等虚拟机管理程序或基于JTAG的硬件接口 - 对当前或指定的CPU使用per-cpu函数:: - (gdb) p $lx_per_cpu("runqueues").nr_running + (gdb) p $lx_per_cpu(runqueues).nr_running $3 = 1 - (gdb) p $lx_per_cpu("runqueues", 2).nr_running + (gdb) p $lx_per_cpu(runqueues, 2).nr_running $4 = 0 - 使用container_of查看更多hrtimers信息:: - (gdb) set $next = $lx_per_cpu("hrtimer_bases").clock_base[0].active.next - (gdb) p *$container_of($next, "struct hrtimer", "node") + (gdb) set $leftmost = $lx_per_cpu(hrtimer_bases).clock_base[0].active.rb_root.rb_leftmost + (gdb) p *$container_of($leftmost, "struct hrtimer", "node") $5 = { node = { node = { - __rb_parent_color = 18446612133355256072, - rb_right = 0x0 , - rb_left = 0x0 + __rb_parent_color = 18446612686384860673, + rb_right = 0xffff888231da8b00, + rb_left = 0x0 }, - expires = { - tv64 = 1835268000000 - } + expires = 1228461000000 }, - _softexpires = { - tv64 = 1835268000000 - }, - function = 0xffffffff81078232 , - base = 0xffff88003fd0d6f0, - state = 1, - start_pid = 0, - start_site = 0xffffffff81055c1f , - start_comm = "swapper/2\000\000\000\000\000\000" + _softexpires = 1228461000000, + function = 0xffffffff8137ab20 , + base = 0xffff888231d9b4c0, + state = 1 '\001', + is_rel = 0 '\000', + is_soft = 0 '\000', + is_hard = 1 '\001' } diff --git a/Documentation/translations/zh_TW/dev-tools/gdb-kernel-debugging.rst b/Documentation/translations/zh_TW/dev-tools/gdb-kernel-debugging.rst index c881e8872b19..b595af59ba78 100644 --- a/Documentation/translations/zh_TW/dev-tools/gdb-kernel-debugging.rst +++ b/Documentation/translations/zh_TW/dev-tools/gdb-kernel-debugging.rst @@ -116,35 +116,31 @@ Kgdb內核調試器、QEMU等虛擬機管理程序或基於JTAG的硬件接口 - 對當前或指定的CPU使用per-cpu函數:: - (gdb) p $lx_per_cpu("runqueues").nr_running + (gdb) p $lx_per_cpu(runqueues).nr_running $3 = 1 - (gdb) p $lx_per_cpu("runqueues", 2).nr_running + (gdb) p $lx_per_cpu(runqueues, 2).nr_running $4 = 0 - 使用container_of查看更多hrtimers信息:: - (gdb) set $next = $lx_per_cpu("hrtimer_bases").clock_base[0].active.next - (gdb) p *$container_of($next, "struct hrtimer", "node") + (gdb) set $leftmost = $lx_per_cpu(hrtimer_bases).clock_base[0].active.rb_root.rb_leftmost + (gdb) p *$container_of($leftmost, "struct hrtimer", "node") $5 = { node = { node = { - __rb_parent_color = 18446612133355256072, - rb_right = 0x0 , - rb_left = 0x0 + __rb_parent_color = 18446612686384860673, + rb_right = 0xffff888231da8b00, + rb_left = 0x0 }, - expires = { - tv64 = 1835268000000 - } + expires = 1228461000000 }, - _softexpires = { - tv64 = 1835268000000 - }, - function = 0xffffffff81078232 , - base = 0xffff88003fd0d6f0, - state = 1, - start_pid = 0, - start_site = 0xffffffff81055c1f , - start_comm = "swapper/2\000\000\000\000\000\000" + _softexpires = 1228461000000, + function = 0xffffffff8137ab20 , + base = 0xffff888231d9b4c0, + state = 1 '\001', + is_rel = 0 '\000', + is_soft = 0 '\000', + is_hard = 1 '\001' } diff --git a/scripts/gdb/linux/cpus.py b/scripts/gdb/linux/cpus.py index f506965ea759..6edf4ef61636 100644 --- a/scripts/gdb/linux/cpus.py +++ b/scripts/gdb/linux/cpus.py @@ -141,7 +141,7 @@ LxCpus() class PerCpu(gdb.Function): """Return per-cpu variable. -$lx_per_cpu("VAR"[, CPU]): Return the per-cpu variable called VAR for the +$lx_per_cpu(VAR[, CPU]): Return the per-cpu variable called VAR for the given CPU number. If CPU is omitted, the CPU of the current context is used. Note that VAR has to be quoted as string.""" @@ -158,7 +158,7 @@ PerCpu() class PerCpuPtr(gdb.Function): """Return per-cpu pointer. -$lx_per_cpu_ptr("VAR"[, CPU]): Return the per-cpu pointer called VAR for the +$lx_per_cpu_ptr(VAR[, CPU]): Return the per-cpu pointer called VAR for the given CPU number. If CPU is omitted, the CPU of the current context is used. Note that VAR has to be quoted as string.""" -- cgit v1.2.3-59-g8ed1b From cf80fdbc0a5512feabbc56280b2abbb51d4e5b4e Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 2 May 2025 15:17:42 +0300 Subject: list: remove redundant 'extern' for function prototypes The 'extern' keyword is redundant for function prototypes. list.h never used them and new code in general is better without them. Link: https://lkml.kernel.org/r/20250502121742.3997529-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Signed-off-by: Andrew Morton --- include/linux/list.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/list.h b/include/linux/list.h index 29a375889fb8..e7e28afd28f8 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -50,9 +50,9 @@ static inline void INIT_LIST_HEAD(struct list_head *list) * Performs the full set of list corruption checks before __list_add(). * On list corruption reports a warning, and returns false. */ -extern bool __list_valid_slowpath __list_add_valid_or_report(struct list_head *new, - struct list_head *prev, - struct list_head *next); +bool __list_valid_slowpath __list_add_valid_or_report(struct list_head *new, + struct list_head *prev, + struct list_head *next); /* * Performs list corruption checks before __list_add(). Returns false if a @@ -93,7 +93,7 @@ static __always_inline bool __list_add_valid(struct list_head *new, * Performs the full set of list corruption checks before __list_del_entry(). * On list corruption reports a warning, and returns false. */ -extern bool __list_valid_slowpath __list_del_entry_valid_or_report(struct list_head *entry); +bool __list_valid_slowpath __list_del_entry_valid_or_report(struct list_head *entry); /* * Performs list corruption checks before __list_del_entry(). Returns false if a -- cgit v1.2.3-59-g8ed1b From bf454ec31add6790f6cdc88328e38901fcbbade6 Mon Sep 17 00:00:00 2001 From: Coiby Xu Date: Fri, 2 May 2025 09:12:35 +0800 Subject: kexec_file: allow to place kexec_buf randomly Patch series "Support kdump with LUKS encryption by reusing LUKS volume keys", v9. LUKS is the standard for Linux disk encryption, widely adopted by users, and in some cases, such as Confidential VMs, it is a requirement. With kdump enabled, when the first kernel crashes, the system can boot into the kdump/crash kernel to dump the memory image (i.e., /proc/vmcore) to a specified target. However, there are two challenges when dumping vmcore to a LUKS-encrypted device: - Kdump kernel may not be able to decrypt the LUKS partition. For some machines, a system administrator may not have a chance to enter the password to decrypt the device in kdump initramfs after the 1st kernel crashes; For cloud confidential VMs, depending on the policy the kdump kernel may not be able to unseal the keys with TPM and the console virtual keyboard is untrusted. - LUKS2 by default use the memory-hard Argon2 key derivation function which is quite memory-consuming compared to the limited memory reserved for kdump. Take Fedora example, by default, only 256M is reserved for systems having memory between 4G-64G. With LUKS enabled, ~1300M needs to be reserved for kdump. Note if the memory reserved for kdump can't be used by 1st kernel i.e. an user sees ~1300M memory missing in the 1st kernel. Besides users (at least for Fedora) usually expect kdump to work out of the box i.e. no manual password input or custom crashkernel value is needed. And it doesn't make sense to derivate the keys again in kdump kernel which seems to be redundant work. This patchset addresses the above issues by making the LUKS volume keys persistent for kdump kernel with the help of cryptsetup's new APIs (--link-vk-to-keyring/--volume-key-keyring). Here is the life cycle of the kdump copies of LUKS volume keys, 1. After the 1st kernel loads the initramfs during boot, systemd use an user-input passphrase to de-crypt the LUKS volume keys or TPM-sealed key and then save the volume keys to specified keyring (using the --link-vk-to-keyring API) and the key will expire within specified time. 2. A user space tool (kdump initramfs loader like kdump-utils) create key items inside /sys/kernel/config/crash_dm_crypt_keys to inform the 1st kernel which keys are needed. 3. When the kdump initramfs is loaded by the kexec_file_load syscall, the 1st kernel will iterate created key items, save the keys to kdump reserved memory. 4. When the 1st kernel crashes and the kdump initramfs is booted, the kdump initramfs asks the kdump kernel to create a user key using the key stored in kdump reserved memory by writing yes to /sys/kernel/crash_dm_crypt_keys/restore. Then the LUKS encrypted device is unlocked with libcryptsetup's --volume-key-keyring API. 5. The system gets rebooted to the 1st kernel after dumping vmcore to the LUKS encrypted device is finished After libcryptsetup saving the LUKS volume keys to specified keyring, whoever takes this should be responsible for the safety of these copies of keys. The keys will be saved in the memory area exclusively reserved for kdump where even the 1st kernel has no direct access. And further more, two additional protections are added, - save the copy randomly in kdump reserved memory as suggested by Jan - clear the _PAGE_PRESENT flag of the page that stores the copy as suggested by Pingfan This patchset only supports x86. There will be patches to support other architectures once this patch set gets merged. This patch (of 9): Currently, kexec_buf is placed in order which means for the same machine, the info in the kexec_buf is always located at the same position each time the machine is booted. This may cause a risk for sensitive information like LUKS volume key. Now struct kexec_buf has a new field random which indicates it's supposed to be placed in a random position. Note this feature is enabled only when CONFIG_CRASH_DUMP is enabled. So it only takes effect for kdump and won't impact kexec reboot. Link: https://lkml.kernel.org/r/20250502011246.99238-1-coxu@redhat.com Link: https://lkml.kernel.org/r/20250502011246.99238-2-coxu@redhat.com Signed-off-by: Coiby Xu Suggested-by: Jan Pazdziora Acked-by: Baoquan He Cc: "Daniel P. Berrange" Cc: Dave Hansen Cc: Dave Young Cc: Liu Pingfan Cc: Milan Broz Cc: Ondrej Kozina Cc: Vitaly Kuznetsov Signed-off-by: Andrew Morton --- include/linux/kexec.h | 30 ++++++++++++++++++++++++++++++ kernel/kexec_file.c | 3 +++ 2 files changed, 33 insertions(+) diff --git a/include/linux/kexec.h b/include/linux/kexec.h index c8971861521a..1871eaa95432 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -25,6 +25,10 @@ extern note_buf_t __percpu *crash_notes; +#ifdef CONFIG_CRASH_DUMP +#include +#endif + #ifdef CONFIG_KEXEC_CORE #include #include @@ -169,6 +173,7 @@ int kexec_image_post_load_cleanup_default(struct kimage *image); * @buf_min: The buffer can't be placed below this address. * @buf_max: The buffer can't be placed above this address. * @top_down: Allocate from top of memory. + * @random: Place the buffer at a random position. */ struct kexec_buf { struct kimage *image; @@ -180,8 +185,33 @@ struct kexec_buf { unsigned long buf_min; unsigned long buf_max; bool top_down; +#ifdef CONFIG_CRASH_DUMP + bool random; +#endif }; + +#ifdef CONFIG_CRASH_DUMP +static inline void kexec_random_range_start(unsigned long start, + unsigned long end, + struct kexec_buf *kbuf, + unsigned long *temp_start) +{ + unsigned short i; + + if (kbuf->random) { + get_random_bytes(&i, sizeof(unsigned short)); + *temp_start = start + (end - start) / USHRT_MAX * i; + } +} +#else +static inline void kexec_random_range_start(unsigned long start, + unsigned long end, + struct kexec_buf *kbuf, + unsigned long *temp_start) +{} +#endif + int kexec_load_purgatory(struct kimage *image, struct kexec_buf *kbuf); int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name, void *buf, unsigned int size, diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 14e5087b4277..4a88bfc33824 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -444,6 +444,7 @@ static int locate_mem_hole_top_down(unsigned long start, unsigned long end, temp_end = min(end, kbuf->buf_max); temp_start = temp_end - kbuf->memsz + 1; + kexec_random_range_start(temp_start, temp_end, kbuf, &temp_start); do { /* align down start */ @@ -488,6 +489,8 @@ static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end, temp_start = max(start, kbuf->buf_min); + kexec_random_range_start(temp_start, end, kbuf, &temp_start); + do { temp_start = ALIGN(temp_start, kbuf->buf_align); temp_end = temp_start + kbuf->memsz - 1; -- cgit v1.2.3-59-g8ed1b From 180cf31af7c313790f1e0fba1c7aa42512144dd5 Mon Sep 17 00:00:00 2001 From: Coiby Xu Date: Fri, 2 May 2025 09:12:36 +0800 Subject: crash_dump: make dm crypt keys persist for the kdump kernel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A configfs /sys/kernel/config/crash_dm_crypt_keys is provided for user space to make the dm crypt keys persist for the kdump kernel. Take the case of dumping to a LUKS-encrypted target as an example, here is the life cycle of the kdump copies of LUKS volume keys, 1. After the 1st kernel loads the initramfs during boot, systemd uses an user-input passphrase to de-crypt the LUKS volume keys or simply TPM-sealed volume keys and then save the volume keys to specified keyring (using the --link-vk-to-keyring API) and the keys will expire within specified time. 2. A user space tool (kdump initramfs loader like kdump-utils) create key items inside /sys/kernel/config/crash_dm_crypt_keys to inform the 1st kernel which keys are needed. 3. When the kdump initramfs is loaded by the kexec_file_load syscall, the 1st kernel will iterate created key items, save the keys to kdump reserved memory. 4. When the 1st kernel crashes and the kdump initramfs is booted, the kdump initramfs asks the kdump kernel to create a user key using the key stored in kdump reserved memory by writing yes to /sys/kernel/crash_dm_crypt_keys/restore. Then the LUKS encrypted device is unlocked with libcryptsetup's --volume-key-keyring API. 5. The system gets rebooted to the 1st kernel after dumping vmcore to the LUKS encrypted device is finished Eventually the keys have to stay in the kdump reserved memory for the kdump kernel to unlock encrypted volumes. During this process, some measures like letting the keys expire within specified time are desirable to reduce security risk. This patch assumes, 1) there are 128 LUKS devices at maximum to be unlocked thus MAX_KEY_NUM=128. 2) a key description won't exceed 128 bytes thus KEY_DESC_MAX_LEN=128. And here is a demo on how to interact with /sys/kernel/config/crash_dm_crypt_keys, # Add key #1 mkdir /sys/kernel/config/crash_dm_crypt_keys/7d26b7b4-e342-4d2d-b660-7426b0996720 # Add key #1's description echo cryptsetup:7d26b7b4-e342-4d2d-b660-7426b0996720 > /sys/kernel/config/crash_dm_crypt_keys/description # how many keys do we have now? cat /sys/kernel/config/crash_dm_crypt_keys/count 1 # Add key# 2 in the same way # how many keys do we have now? cat /sys/kernel/config/crash_dm_crypt_keys/count 2 # the tree structure of /crash_dm_crypt_keys configfs tree /sys/kernel/config/crash_dm_crypt_keys/ /sys/kernel/config/crash_dm_crypt_keys/ ├── 7d26b7b4-e342-4d2d-b660-7426b0996720 │   └── description ├── count ├── fce2cd38-4d59-4317-8ce2-1fd24d52c46a │   └── description Link: https://lkml.kernel.org/r/20250502011246.99238-3-coxu@redhat.com Signed-off-by: Coiby Xu Acked-by: Baoquan He Cc: "Daniel P. Berrange" Cc: Dave Hansen Cc: Dave Young Cc: Jan Pazdziora Cc: Liu Pingfan Cc: Milan Broz Cc: Ondrej Kozina Cc: Vitaly Kuznetsov Signed-off-by: Andrew Morton --- Documentation/admin-guide/kdump/kdump.rst | 28 ++++++ kernel/Kconfig.kexec | 11 +++ kernel/Makefile | 1 + kernel/crash_dump_dm_crypt.c | 154 ++++++++++++++++++++++++++++++ 4 files changed, 194 insertions(+) create mode 100644 kernel/crash_dump_dm_crypt.c diff --git a/Documentation/admin-guide/kdump/kdump.rst b/Documentation/admin-guide/kdump/kdump.rst index 1f7f14c6e184..b74d3bed8fff 100644 --- a/Documentation/admin-guide/kdump/kdump.rst +++ b/Documentation/admin-guide/kdump/kdump.rst @@ -547,6 +547,34 @@ from within add_taint() whenever the value set in this bitmask matches with the bit flag being set by add_taint(). This will cause a kdump to occur at the add_taint()->panic() call. +Write the dump file to encrypted disk volume +============================================ + +CONFIG_CRASH_DM_CRYPT can be enabled to support saving the dump file to an +encrypted disk volume. User space can interact with +/sys/kernel/config/crash_dm_crypt_keys for setup, + +1. Tell the first kernel what logon keys are needed to unlock the disk volumes, + # Add key #1 + mkdir /sys/kernel/config/crash_dm_crypt_keys/7d26b7b4-e342-4d2d-b660-7426b0996720 + # Add key #1's description + echo cryptsetup:7d26b7b4-e342-4d2d-b660-7426b0996720 > /sys/kernel/config/crash_dm_crypt_keys/description + + # how many keys do we have now? + cat /sys/kernel/config/crash_dm_crypt_keys/count + 1 + + # Add key #2 in the same way + + # how many keys do we have now? + cat /sys/kernel/config/crash_dm_crypt_keys/count + 2 + +2. Load the dump-capture kernel + +3. After the dump-capture kerne get booted, restore the keys to user keyring + echo yes > /sys/kernel/crash_dm_crypt_keys/restore + Contact ======= diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec index 9fea9dca15bd..c3e180d5e4fd 100644 --- a/kernel/Kconfig.kexec +++ b/kernel/Kconfig.kexec @@ -115,6 +115,17 @@ config CRASH_DUMP For s390, this option also enables zfcpdump. See also +config CRASH_DM_CRYPT + bool "Support saving crash dump to dm-crypt encrypted volume" + depends on KEXEC_FILE + depends on CRASH_DUMP + depends on DM_CRYPT + depends on CONFIGFS_FS + help + With this option enabled, user space can intereact with + /sys/kernel/config/crash_dm_crypt_keys to make the dm crypt keys + persistent for the dump-capture kernel. + config CRASH_HOTPLUG bool "Update the crash elfcorehdr on system configuration changes" default y diff --git a/kernel/Makefile b/kernel/Makefile index 434929de17ef..c7d3793107e5 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -77,6 +77,7 @@ obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o elfcorehdr.o obj-$(CONFIG_CRASH_RESERVE) += crash_reserve.o obj-$(CONFIG_KEXEC_CORE) += kexec_core.o obj-$(CONFIG_CRASH_DUMP) += crash_core.o +obj-$(CONFIG_CRASH_DM_CRYPT) += crash_dump_dm_crypt.o obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_KEXEC_FILE) += kexec_file.o obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o diff --git a/kernel/crash_dump_dm_crypt.c b/kernel/crash_dump_dm_crypt.c new file mode 100644 index 000000000000..62a3c47d8b3b --- /dev/null +++ b/kernel/crash_dump_dm_crypt.c @@ -0,0 +1,154 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include + +#define KEY_NUM_MAX 128 /* maximum dm crypt keys */ +#define KEY_DESC_MAX_LEN 128 /* maximum dm crypt key description size */ + +static unsigned int key_count; + +struct config_key { + struct config_item item; + const char *description; +}; + +static inline struct config_key *to_config_key(struct config_item *item) +{ + return container_of(item, struct config_key, item); +} + +static ssize_t config_key_description_show(struct config_item *item, char *page) +{ + return sprintf(page, "%s\n", to_config_key(item)->description); +} + +static ssize_t config_key_description_store(struct config_item *item, + const char *page, size_t count) +{ + struct config_key *config_key = to_config_key(item); + size_t len; + int ret; + + ret = -EINVAL; + len = strcspn(page, "\n"); + + if (len > KEY_DESC_MAX_LEN) { + pr_err("The key description shouldn't exceed %u characters", KEY_DESC_MAX_LEN); + return ret; + } + + if (!len) + return ret; + + kfree(config_key->description); + ret = -ENOMEM; + config_key->description = kmemdup_nul(page, len, GFP_KERNEL); + if (!config_key->description) + return ret; + + return count; +} + +CONFIGFS_ATTR(config_key_, description); + +static struct configfs_attribute *config_key_attrs[] = { + &config_key_attr_description, + NULL, +}; + +static void config_key_release(struct config_item *item) +{ + kfree(to_config_key(item)); + key_count--; +} + +static struct configfs_item_operations config_key_item_ops = { + .release = config_key_release, +}; + +static const struct config_item_type config_key_type = { + .ct_item_ops = &config_key_item_ops, + .ct_attrs = config_key_attrs, + .ct_owner = THIS_MODULE, +}; + +static struct config_item *config_keys_make_item(struct config_group *group, + const char *name) +{ + struct config_key *config_key; + + if (key_count > KEY_NUM_MAX) { + pr_err("Only %u keys at maximum to be created\n", KEY_NUM_MAX); + return ERR_PTR(-EINVAL); + } + + config_key = kzalloc(sizeof(struct config_key), GFP_KERNEL); + if (!config_key) + return ERR_PTR(-ENOMEM); + + config_item_init_type_name(&config_key->item, name, &config_key_type); + + key_count++; + + return &config_key->item; +} + +static ssize_t config_keys_count_show(struct config_item *item, char *page) +{ + return sprintf(page, "%d\n", key_count); +} + +CONFIGFS_ATTR_RO(config_keys_, count); + +static struct configfs_attribute *config_keys_attrs[] = { + &config_keys_attr_count, + NULL, +}; + +/* + * Note that, since no extra work is required on ->drop_item(), + * no ->drop_item() is provided. + */ +static struct configfs_group_operations config_keys_group_ops = { + .make_item = config_keys_make_item, +}; + +static const struct config_item_type config_keys_type = { + .ct_group_ops = &config_keys_group_ops, + .ct_attrs = config_keys_attrs, + .ct_owner = THIS_MODULE, +}; + +static struct configfs_subsystem config_keys_subsys = { + .su_group = { + .cg_item = { + .ci_namebuf = "crash_dm_crypt_keys", + .ci_type = &config_keys_type, + }, + }, +}; + +static int __init configfs_dmcrypt_keys_init(void) +{ + int ret; + + config_group_init(&config_keys_subsys.su_group); + mutex_init(&config_keys_subsys.su_mutex); + ret = configfs_register_subsystem(&config_keys_subsys); + if (ret) { + pr_err("Error %d while registering subsystem %s\n", ret, + config_keys_subsys.su_group.cg_item.ci_namebuf); + goto out_unregister; + } + + return 0; + +out_unregister: + configfs_unregister_subsystem(&config_keys_subsys); + + return ret; +} + +module_init(configfs_dmcrypt_keys_init); -- cgit v1.2.3-59-g8ed1b From 479e58549b0fa7e80f1e0b9e69e0a2a8e6711132 Mon Sep 17 00:00:00 2001 From: Coiby Xu Date: Fri, 2 May 2025 09:12:37 +0800 Subject: crash_dump: store dm crypt keys in kdump reserved memory When the kdump kernel image and initrd are loaded, the dm crypts keys will be read from keyring and then stored in kdump reserved memory. Assume a key won't exceed 256 bytes thus MAX_KEY_SIZE=256 according to "cryptsetup benchmark". Link: https://lkml.kernel.org/r/20250502011246.99238-4-coxu@redhat.com Signed-off-by: Coiby Xu Acked-by: Baoquan He Cc: "Daniel P. Berrange" Cc: Dave Hansen Cc: Dave Young Cc: Jan Pazdziora Cc: Liu Pingfan Cc: Milan Broz Cc: Ondrej Kozina Cc: Vitaly Kuznetsov Signed-off-by: Andrew Morton --- include/linux/crash_core.h | 6 +- include/linux/kexec.h | 4 ++ kernel/crash_dump_dm_crypt.c | 133 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 142 insertions(+), 1 deletion(-) diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index 44305336314e..2e6782239034 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -34,7 +34,11 @@ static inline void arch_kexec_protect_crashkres(void) { } static inline void arch_kexec_unprotect_crashkres(void) { } #endif - +#ifdef CONFIG_CRASH_DM_CRYPT +int crash_load_dm_crypt_keys(struct kimage *image); +#else +static inline int crash_load_dm_crypt_keys(struct kimage *image) {return 0; } +#endif #ifndef arch_crash_handle_hotplug_event static inline void arch_crash_handle_hotplug_event(struct kimage *image, void *arg) { } diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 1871eaa95432..6e688c5d8e4d 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -405,6 +405,10 @@ struct kimage { void *elf_headers; unsigned long elf_headers_sz; unsigned long elf_load_addr; + + /* dm crypt keys buffer */ + unsigned long dm_crypt_keys_addr; + unsigned long dm_crypt_keys_sz; }; /* kexec interface functions */ diff --git a/kernel/crash_dump_dm_crypt.c b/kernel/crash_dump_dm_crypt.c index 62a3c47d8b3b..fb25f55f1512 100644 --- a/kernel/crash_dump_dm_crypt.c +++ b/kernel/crash_dump_dm_crypt.c @@ -1,14 +1,62 @@ // SPDX-License-Identifier: GPL-2.0-only +#include +#include #include #include #include #include #define KEY_NUM_MAX 128 /* maximum dm crypt keys */ +#define KEY_SIZE_MAX 256 /* maximum dm crypt key size */ #define KEY_DESC_MAX_LEN 128 /* maximum dm crypt key description size */ static unsigned int key_count; +struct dm_crypt_key { + unsigned int key_size; + char key_desc[KEY_DESC_MAX_LEN]; + u8 data[KEY_SIZE_MAX]; +}; + +static struct keys_header { + unsigned int total_keys; + struct dm_crypt_key keys[] __counted_by(total_keys); +} *keys_header; + +static size_t get_keys_header_size(size_t total_keys) +{ + return struct_size(keys_header, keys, total_keys); +} + +static int read_key_from_user_keying(struct dm_crypt_key *dm_key) +{ + const struct user_key_payload *ukp; + struct key *key; + + kexec_dprintk("Requesting logon key %s", dm_key->key_desc); + key = request_key(&key_type_logon, dm_key->key_desc, NULL); + + if (IS_ERR(key)) { + pr_warn("No such logon key %s\n", dm_key->key_desc); + return PTR_ERR(key); + } + + ukp = user_key_payload_locked(key); + if (!ukp) + return -EKEYREVOKED; + + if (ukp->datalen > KEY_SIZE_MAX) { + pr_err("Key size %u exceeds maximum (%u)\n", ukp->datalen, KEY_SIZE_MAX); + return -EINVAL; + } + + memcpy(dm_key->data, ukp->data, ukp->datalen); + dm_key->key_size = ukp->datalen; + kexec_dprintk("Get dm crypt key (size=%u) %s: %8ph\n", dm_key->key_size, + dm_key->key_desc, dm_key->data); + return 0; +} + struct config_key { struct config_item item; const char *description; @@ -130,6 +178,91 @@ static struct configfs_subsystem config_keys_subsys = { }, }; +static int build_keys_header(void) +{ + struct config_item *item = NULL; + struct config_key *key; + int i, r; + + if (keys_header != NULL) + kvfree(keys_header); + + keys_header = kzalloc(get_keys_header_size(key_count), GFP_KERNEL); + if (!keys_header) + return -ENOMEM; + + keys_header->total_keys = key_count; + + i = 0; + list_for_each_entry(item, &config_keys_subsys.su_group.cg_children, + ci_entry) { + if (item->ci_type != &config_key_type) + continue; + + key = to_config_key(item); + + if (!key->description) { + pr_warn("No key description for key %s\n", item->ci_name); + return -EINVAL; + } + + strscpy(keys_header->keys[i].key_desc, key->description, + KEY_DESC_MAX_LEN); + r = read_key_from_user_keying(&keys_header->keys[i]); + if (r != 0) { + kexec_dprintk("Failed to read key %s\n", + keys_header->keys[i].key_desc); + return r; + } + i++; + kexec_dprintk("Found key: %s\n", item->ci_name); + } + + return 0; +} + +int crash_load_dm_crypt_keys(struct kimage *image) +{ + struct kexec_buf kbuf = { + .image = image, + .buf_min = 0, + .buf_max = ULONG_MAX, + .top_down = false, + .random = true, + }; + int r; + + + if (key_count <= 0) { + kexec_dprintk("No dm-crypt keys\n"); + return -ENOENT; + } + + image->dm_crypt_keys_addr = 0; + r = build_keys_header(); + if (r) + return r; + + kbuf.buffer = keys_header; + kbuf.bufsz = get_keys_header_size(key_count); + + kbuf.memsz = kbuf.bufsz; + kbuf.buf_align = ELF_CORE_HEADER_ALIGN; + kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; + r = kexec_add_buffer(&kbuf); + if (r) { + kvfree((void *)kbuf.buffer); + return r; + } + image->dm_crypt_keys_addr = kbuf.mem; + image->dm_crypt_keys_sz = kbuf.bufsz; + kexec_dprintk( + "Loaded dm crypt keys to kexec_buffer bufsz=0x%lx memsz=0x%lx\n", + kbuf.bufsz, kbuf.memsz); + + return r; +} + static int __init configfs_dmcrypt_keys_init(void) { int ret; -- cgit v1.2.3-59-g8ed1b From 9ebfa8dcaea77a8ef02d0f9478717a138b0ad828 Mon Sep 17 00:00:00 2001 From: Coiby Xu Date: Fri, 2 May 2025 09:12:38 +0800 Subject: crash_dump: reuse saved dm crypt keys for CPU/memory hot-plugging When there are CPU and memory hot un/plugs, the dm crypt keys may need to be reloaded again depending on the solution for crash hotplug support. Currently, there are two solutions. One is to utilizes udev to instruct user space to reload the kdump kernel image and initrd, elfcorehdr and etc again. The other is to only update the elfcorehdr segment introduced in commit 247262756121 ("crash: add generic infrastructure for crash hotplug support"). For the 1st solution, the dm crypt keys need to be reloaded again. The user space can write true to /sys/kernel/config/crash_dm_crypt_key/reuse so the stored keys can be re-used. For the 2nd solution, the dm crypt keys don't need to be reloaded. Currently, only x86 supports the 2nd solution. If the 2nd solution gets extended to all arches, this patch can be dropped. Link: https://lkml.kernel.org/r/20250502011246.99238-5-coxu@redhat.com Signed-off-by: Coiby Xu Acked-by: Baoquan He Cc: "Daniel P. Berrange" Cc: Dave Hansen Cc: Dave Young Cc: Jan Pazdziora Cc: Liu Pingfan Cc: Milan Broz Cc: Ondrej Kozina Cc: Vitaly Kuznetsov Signed-off-by: Andrew Morton --- Documentation/admin-guide/kdump/kdump.rst | 4 +++ kernel/crash_dump_dm_crypt.c | 52 ++++++++++++++++++++++++++++--- 2 files changed, 52 insertions(+), 4 deletions(-) diff --git a/Documentation/admin-guide/kdump/kdump.rst b/Documentation/admin-guide/kdump/kdump.rst index b74d3bed8fff..e25edaa8e533 100644 --- a/Documentation/admin-guide/kdump/kdump.rst +++ b/Documentation/admin-guide/kdump/kdump.rst @@ -570,6 +570,10 @@ encrypted disk volume. User space can interact with cat /sys/kernel/config/crash_dm_crypt_keys/count 2 + # To support CPU/memory hot-plugging, re-use keys already saved to reserved + # memory + echo true > /sys/kernel/config/crash_dm_crypt_key/reuse + 2. Load the dump-capture kernel 3. After the dump-capture kerne get booted, restore the keys to user keyring diff --git a/kernel/crash_dump_dm_crypt.c b/kernel/crash_dump_dm_crypt.c index fb25f55f1512..5f4a62389150 100644 --- a/kernel/crash_dump_dm_crypt.c +++ b/kernel/crash_dump_dm_crypt.c @@ -28,6 +28,20 @@ static size_t get_keys_header_size(size_t total_keys) return struct_size(keys_header, keys, total_keys); } +static void get_keys_from_kdump_reserved_memory(void) +{ + struct keys_header *keys_header_loaded; + + arch_kexec_unprotect_crashkres(); + + keys_header_loaded = kmap_local_page(pfn_to_page( + kexec_crash_image->dm_crypt_keys_addr >> PAGE_SHIFT)); + + memcpy(keys_header, keys_header_loaded, get_keys_header_size(key_count)); + kunmap_local(keys_header_loaded); + arch_kexec_protect_crashkres(); +} + static int read_key_from_user_keying(struct dm_crypt_key *dm_key) { const struct user_key_payload *ukp; @@ -150,8 +164,36 @@ static ssize_t config_keys_count_show(struct config_item *item, char *page) CONFIGFS_ATTR_RO(config_keys_, count); +static bool is_dm_key_reused; + +static ssize_t config_keys_reuse_show(struct config_item *item, char *page) +{ + return sprintf(page, "%d\n", is_dm_key_reused); +} + +static ssize_t config_keys_reuse_store(struct config_item *item, + const char *page, size_t count) +{ + if (!kexec_crash_image || !kexec_crash_image->dm_crypt_keys_addr) { + kexec_dprintk( + "dm-crypt keys haven't be saved to crash-reserved memory\n"); + return -EINVAL; + } + + if (kstrtobool(page, &is_dm_key_reused)) + return -EINVAL; + + if (is_dm_key_reused) + get_keys_from_kdump_reserved_memory(); + + return count; +} + +CONFIGFS_ATTR(config_keys_, reuse); + static struct configfs_attribute *config_keys_attrs[] = { &config_keys_attr_count, + &config_keys_attr_reuse, NULL, }; @@ -238,10 +280,12 @@ int crash_load_dm_crypt_keys(struct kimage *image) return -ENOENT; } - image->dm_crypt_keys_addr = 0; - r = build_keys_header(); - if (r) - return r; + if (!is_dm_key_reused) { + image->dm_crypt_keys_addr = 0; + r = build_keys_header(); + if (r) + return r; + } kbuf.buffer = keys_header; kbuf.bufsz = get_keys_header_size(key_count); -- cgit v1.2.3-59-g8ed1b From 62f17d9df6924cf805de5ae970470615c1c8d9f2 Mon Sep 17 00:00:00 2001 From: Coiby Xu Date: Fri, 2 May 2025 09:12:39 +0800 Subject: crash_dump: retrieve dm crypt keys in kdump kernel Crash kernel will retrieve the dm crypt keys based on the dmcryptkeys command line parameter. When user space writes the key description to /sys/kernel/config/crash_dm_crypt_key/restore, the crash kernel will save the encryption keys to the user keyring. Then user space e.g. cryptsetup's --volume-key-keyring API can use it to unlock the encrypted device. Link: https://lkml.kernel.org/r/20250502011246.99238-6-coxu@redhat.com Signed-off-by: Coiby Xu Acked-by: Baoquan He Cc: "Daniel P. Berrange" Cc: Dave Hansen Cc: Dave Young Cc: Jan Pazdziora Cc: Liu Pingfan Cc: Milan Broz Cc: Ondrej Kozina Cc: Vitaly Kuznetsov Signed-off-by: Andrew Morton --- include/linux/crash_core.h | 1 + include/linux/crash_dump.h | 2 + kernel/crash_dump_dm_crypt.c | 133 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 136 insertions(+) diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index 2e6782239034..d35726d6a415 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -36,6 +36,7 @@ static inline void arch_kexec_unprotect_crashkres(void) { } #ifdef CONFIG_CRASH_DM_CRYPT int crash_load_dm_crypt_keys(struct kimage *image); +ssize_t dm_crypt_keys_read(char *buf, size_t count, u64 *ppos); #else static inline int crash_load_dm_crypt_keys(struct kimage *image) {return 0; } #endif diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h index 2f2555e6407c..dd6fc3b2133b 100644 --- a/include/linux/crash_dump.h +++ b/include/linux/crash_dump.h @@ -15,6 +15,8 @@ extern unsigned long long elfcorehdr_addr; extern unsigned long long elfcorehdr_size; +extern unsigned long long dm_crypt_keys_addr; + #ifdef CONFIG_CRASH_DUMP extern int elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size); extern void elfcorehdr_free(unsigned long long addr); diff --git a/kernel/crash_dump_dm_crypt.c b/kernel/crash_dump_dm_crypt.c index 5f4a62389150..401423ba477d 100644 --- a/kernel/crash_dump_dm_crypt.c +++ b/kernel/crash_dump_dm_crypt.c @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -28,6 +29,61 @@ static size_t get_keys_header_size(size_t total_keys) return struct_size(keys_header, keys, total_keys); } +unsigned long long dm_crypt_keys_addr; +EXPORT_SYMBOL_GPL(dm_crypt_keys_addr); + +static int __init setup_dmcryptkeys(char *arg) +{ + char *end; + + if (!arg) + return -EINVAL; + dm_crypt_keys_addr = memparse(arg, &end); + if (end > arg) + return 0; + + dm_crypt_keys_addr = 0; + return -EINVAL; +} + +early_param("dmcryptkeys", setup_dmcryptkeys); + +/* + * Architectures may override this function to read dm crypt keys + */ +ssize_t __weak dm_crypt_keys_read(char *buf, size_t count, u64 *ppos) +{ + struct kvec kvec = { .iov_base = buf, .iov_len = count }; + struct iov_iter iter; + + iov_iter_kvec(&iter, READ, &kvec, 1, count); + return read_from_oldmem(&iter, count, ppos, cc_platform_has(CC_ATTR_MEM_ENCRYPT)); +} + +static int add_key_to_keyring(struct dm_crypt_key *dm_key, + key_ref_t keyring_ref) +{ + key_ref_t key_ref; + int r; + + /* create or update the requested key and add it to the target keyring */ + key_ref = key_create_or_update(keyring_ref, "user", dm_key->key_desc, + dm_key->data, dm_key->key_size, + KEY_USR_ALL, KEY_ALLOC_IN_QUOTA); + + if (!IS_ERR(key_ref)) { + r = key_ref_to_ptr(key_ref)->serial; + key_ref_put(key_ref); + kexec_dprintk("Success adding key %s", dm_key->key_desc); + } else { + r = PTR_ERR(key_ref); + kexec_dprintk("Error when adding key"); + } + + key_ref_put(keyring_ref); + return r; +} + static void get_keys_from_kdump_reserved_memory(void) { struct keys_header *keys_header_loaded; @@ -42,6 +98,47 @@ static void get_keys_from_kdump_reserved_memory(void) arch_kexec_protect_crashkres(); } +static int restore_dm_crypt_keys_to_thread_keyring(void) +{ + struct dm_crypt_key *key; + size_t keys_header_size; + key_ref_t keyring_ref; + u64 addr; + + /* find the target keyring (which must be writable) */ + keyring_ref = + lookup_user_key(KEY_SPEC_USER_KEYRING, 0x01, KEY_NEED_WRITE); + if (IS_ERR(keyring_ref)) { + kexec_dprintk("Failed to get the user keyring\n"); + return PTR_ERR(keyring_ref); + } + + addr = dm_crypt_keys_addr; + dm_crypt_keys_read((char *)&key_count, sizeof(key_count), &addr); + if (key_count < 0 || key_count > KEY_NUM_MAX) { + kexec_dprintk("Failed to read the number of dm-crypt keys\n"); + return -1; + } + + kexec_dprintk("There are %u keys\n", key_count); + addr = dm_crypt_keys_addr; + + keys_header_size = get_keys_header_size(key_count); + keys_header = kzalloc(keys_header_size, GFP_KERNEL); + if (!keys_header) + return -ENOMEM; + + dm_crypt_keys_read((char *)keys_header, keys_header_size, &addr); + + for (int i = 0; i < keys_header->total_keys; i++) { + key = &keys_header->keys[i]; + kexec_dprintk("Get key (size=%u)\n", key->key_size); + add_key_to_keyring(key, keyring_ref); + } + + return 0; +} + static int read_key_from_user_keying(struct dm_crypt_key *dm_key) { const struct user_key_payload *ukp; @@ -211,6 +308,37 @@ static const struct config_item_type config_keys_type = { .ct_owner = THIS_MODULE, }; +static bool restore; + +static ssize_t config_keys_restore_show(struct config_item *item, char *page) +{ + return sprintf(page, "%d\n", restore); +} + +static ssize_t config_keys_restore_store(struct config_item *item, + const char *page, size_t count) +{ + if (!restore) + restore_dm_crypt_keys_to_thread_keyring(); + + if (kstrtobool(page, &restore)) + return -EINVAL; + + return count; +} + +CONFIGFS_ATTR(config_keys_, restore); + +static struct configfs_attribute *kdump_config_keys_attrs[] = { + &config_keys_attr_restore, + NULL, +}; + +static const struct config_item_type kdump_config_keys_type = { + .ct_attrs = kdump_config_keys_attrs, + .ct_owner = THIS_MODULE, +}; + static struct configfs_subsystem config_keys_subsys = { .su_group = { .cg_item = { @@ -311,6 +439,11 @@ static int __init configfs_dmcrypt_keys_init(void) { int ret; + if (is_kdump_kernel()) { + config_keys_subsys.su_group.cg_item.ci_type = + &kdump_config_keys_type; + } + config_group_init(&config_keys_subsys.su_group); mutex_init(&config_keys_subsys.su_mutex); ret = configfs_register_subsystem(&config_keys_subsys); -- cgit v1.2.3-59-g8ed1b From e1e6cd01d93359e22be84a23c8bb24ee4e04e142 Mon Sep 17 00:00:00 2001 From: Coiby Xu Date: Fri, 2 May 2025 09:12:40 +0800 Subject: Revert "x86/mm: Remove unused __set_memory_prot()" This reverts commit 693bbf2a50447353c6a47961e6a7240a823ace02 as kdump LUKS support (CONFIG_CRASH_DM_CRYPT) depends on __set_memory_prot. [akpm@linux-foundation.org: x86 set_memory.h needs pgtable_types.h] Link: https://lkml.kernel.org/r/20250502011246.99238-7-coxu@redhat.com Signed-off-by: Coiby Xu Cc: Baoquan He Cc: "Daniel P. Berrange" Cc: Dave Hansen Cc: Dave Young Cc: Jan Pazdziora Cc: Liu Pingfan Cc: Milan Broz Cc: Ondrej Kozina Cc: Vitaly Kuznetsov Signed-off-by: Andrew Morton --- arch/x86/include/asm/set_memory.h | 2 ++ arch/x86/mm/pat/set_memory.c | 13 +++++++++++++ 2 files changed, 15 insertions(+) diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h index 8d9f1c9aaa4c..61f56cdaccb5 100644 --- a/arch/x86/include/asm/set_memory.h +++ b/arch/x86/include/asm/set_memory.h @@ -4,6 +4,7 @@ #include #include +#include #define set_memory_rox set_memory_rox int set_memory_rox(unsigned long addr, int numpages); @@ -37,6 +38,7 @@ int set_memory_rox(unsigned long addr, int numpages); * The caller is required to take care of these. */ +int __set_memory_prot(unsigned long addr, int numpages, pgprot_t prot); int _set_memory_uc(unsigned long addr, int numpages); int _set_memory_wc(unsigned long addr, int numpages); int _set_memory_wt(unsigned long addr, int numpages); diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index def3d9284254..df7502ad165c 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -2148,6 +2148,19 @@ static inline int cpa_clear_pages_array(struct page **pages, int numpages, CPA_PAGES_ARRAY, pages); } +/* + * __set_memory_prot is an internal helper for callers that have been passed + * a pgprot_t value from upper layers and a reservation has already been taken. + * If you want to set the pgprot to a specific page protocol, use the + * set_memory_xx() functions. + */ +int __set_memory_prot(unsigned long addr, int numpages, pgprot_t prot) +{ + return change_page_attr_set_clr(&addr, numpages, prot, + __pgprot(~pgprot_val(prot)), 0, 0, + NULL); +} + int _set_memory_uc(unsigned long addr, int numpages) { /* -- cgit v1.2.3-59-g8ed1b From 5eb3f60554216b02e9c0f18356709ee4befe72e7 Mon Sep 17 00:00:00 2001 From: Coiby Xu Date: Fri, 2 May 2025 09:12:41 +0800 Subject: x86/crash: pass dm crypt keys to kdump kernel 1st kernel will build up the kernel command parameter dmcryptkeys as similar to elfcorehdr to pass the memory address of the stored info of dm crypt key to kdump kernel. Link: https://lkml.kernel.org/r/20250502011246.99238-8-coxu@redhat.com Signed-off-by: Coiby Xu Acked-by: Baoquan He Cc: "Daniel P. Berrange" Cc: Dave Hansen Cc: Dave Young Cc: Jan Pazdziora Cc: Liu Pingfan Cc: Milan Broz Cc: Ondrej Kozina Cc: Vitaly Kuznetsov Signed-off-by: Andrew Morton --- Documentation/admin-guide/kdump/kdump.rst | 4 ++-- arch/x86/kernel/crash.c | 26 ++++++++++++++++++++++++-- arch/x86/kernel/kexec-bzimage64.c | 21 +++++++++++++++++++++ 3 files changed, 47 insertions(+), 4 deletions(-) diff --git a/Documentation/admin-guide/kdump/kdump.rst b/Documentation/admin-guide/kdump/kdump.rst index e25edaa8e533..20fabdf6567e 100644 --- a/Documentation/admin-guide/kdump/kdump.rst +++ b/Documentation/admin-guide/kdump/kdump.rst @@ -551,8 +551,8 @@ Write the dump file to encrypted disk volume ============================================ CONFIG_CRASH_DM_CRYPT can be enabled to support saving the dump file to an -encrypted disk volume. User space can interact with -/sys/kernel/config/crash_dm_crypt_keys for setup, +encrypted disk volume (only x86_64 supported for now). User space can interact +with /sys/kernel/config/crash_dm_crypt_keys for setup, 1. Tell the first kernel what logon keys are needed to unlock the disk volumes, # Add key #1 diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 0be61c45400c..bcb534688dfe 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -278,6 +278,7 @@ static int memmap_exclude_ranges(struct kimage *image, struct crash_mem *cmem, unsigned long long mend) { unsigned long start, end; + int ret; cmem->ranges[0].start = mstart; cmem->ranges[0].end = mend; @@ -286,22 +287,43 @@ static int memmap_exclude_ranges(struct kimage *image, struct crash_mem *cmem, /* Exclude elf header region */ start = image->elf_load_addr; end = start + image->elf_headers_sz - 1; - return crash_exclude_mem_range(cmem, start, end); + ret = crash_exclude_mem_range(cmem, start, end); + + if (ret) + return ret; + + /* Exclude dm crypt keys region */ + if (image->dm_crypt_keys_addr) { + start = image->dm_crypt_keys_addr; + end = start + image->dm_crypt_keys_sz - 1; + return crash_exclude_mem_range(cmem, start, end); + } + + return ret; } /* Prepare memory map for crash dump kernel */ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params) { + unsigned int nr_ranges = 0; int i, ret = 0; unsigned long flags; struct e820_entry ei; struct crash_memmap_data cmd; struct crash_mem *cmem; - cmem = vzalloc(struct_size(cmem, ranges, 1)); + /* + * Using random kexec_buf for passing dm crypt keys may cause a range + * split. So use two slots here. + */ + nr_ranges = 2; + cmem = vzalloc(struct_size(cmem, ranges, nr_ranges)); if (!cmem) return -ENOMEM; + cmem->max_nr_ranges = nr_ranges; + cmem->nr_ranges = 0; + memset(&cmd, 0, sizeof(struct crash_memmap_data)); cmd.params = params; diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index 68530fad05f7..c71cdd8e425a 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -27,6 +27,8 @@ #include #define MAX_ELFCOREHDR_STR_LEN 30 /* elfcorehdr=0x<64bit-value> */ +#define MAX_DMCRYPTKEYS_STR_LEN 31 /* dmcryptkeys=0x<64bit-value> */ + /* * Defines lowest physical address for various segments. Not sure where @@ -76,6 +78,10 @@ static int setup_cmdline(struct kimage *image, struct boot_params *params, if (image->type == KEXEC_TYPE_CRASH) { len = sprintf(cmdline_ptr, "elfcorehdr=0x%lx ", image->elf_load_addr); + + if (image->dm_crypt_keys_addr != 0) + len += sprintf(cmdline_ptr + len, + "dmcryptkeys=0x%lx ", image->dm_crypt_keys_addr); } memcpy(cmdline_ptr + len, cmdline, cmdline_len); cmdline_len += len; @@ -441,6 +447,19 @@ static void *bzImage64_load(struct kimage *image, char *kernel, ret = crash_load_segments(image); if (ret) return ERR_PTR(ret); + ret = crash_load_dm_crypt_keys(image); + if (ret == -ENOENT) { + kexec_dprintk("No dm crypt key to load\n"); + } else if (ret) { + pr_err("Failed to load dm crypt keys\n"); + return ERR_PTR(ret); + } + if (image->dm_crypt_keys_addr && + cmdline_len + MAX_ELFCOREHDR_STR_LEN + MAX_DMCRYPTKEYS_STR_LEN > + header->cmdline_size) { + pr_err("Appending dmcryptkeys= to command line exceeds maximum allowed length\n"); + return ERR_PTR(-EINVAL); + } } #endif @@ -468,6 +487,8 @@ static void *bzImage64_load(struct kimage *image, char *kernel, efi_map_sz = efi_get_runtime_map_size(); params_cmdline_sz = sizeof(struct boot_params) + cmdline_len + MAX_ELFCOREHDR_STR_LEN; + if (image->dm_crypt_keys_addr) + params_cmdline_sz += MAX_DMCRYPTKEYS_STR_LEN; params_cmdline_sz = ALIGN(params_cmdline_sz, 16); kbuf.bufsz = params_cmdline_sz + ALIGN(efi_map_sz, 16) + sizeof(struct setup_data) + -- cgit v1.2.3-59-g8ed1b From cc66e4863ac3a00c709bfcbec685d48c184172b5 Mon Sep 17 00:00:00 2001 From: Coiby Xu Date: Fri, 2 May 2025 09:12:42 +0800 Subject: x86/crash: make the page that stores the dm crypt keys inaccessible This adds an addition layer of protection for the saved copy of dm crypt key. Trying to access the saved copy will cause page fault. Link: https://lkml.kernel.org/r/20250502011246.99238-9-coxu@redhat.com Signed-off-by: Coiby Xu Suggested-by: Pingfan Liu Acked-by: Baoquan He Cc: "Daniel P. Berrange" Cc: Dave Hansen Cc: Dave Young Cc: Jan Pazdziora Cc: Milan Broz Cc: Ondrej Kozina Cc: Vitaly Kuznetsov Signed-off-by: Andrew Morton --- arch/x86/kernel/machine_kexec_64.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index a68f5a0a9f37..f615fcb6d35d 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -598,13 +598,35 @@ static void kexec_mark_crashkres(bool protect) kexec_mark_range(control, crashk_res.end, protect); } +/* make the memory storing dm crypt keys in/accessible */ +static void kexec_mark_dm_crypt_keys(bool protect) +{ + unsigned long start_paddr, end_paddr; + unsigned int nr_pages; + + if (kexec_crash_image->dm_crypt_keys_addr) { + start_paddr = kexec_crash_image->dm_crypt_keys_addr; + end_paddr = start_paddr + kexec_crash_image->dm_crypt_keys_sz - 1; + nr_pages = (PAGE_ALIGN(end_paddr) - PAGE_ALIGN_DOWN(start_paddr))/PAGE_SIZE; + if (protect) + set_memory_np((unsigned long)phys_to_virt(start_paddr), nr_pages); + else + __set_memory_prot( + (unsigned long)phys_to_virt(start_paddr), + nr_pages, + __pgprot(_PAGE_PRESENT | _PAGE_NX | _PAGE_RW)); + } +} + void arch_kexec_protect_crashkres(void) { kexec_mark_crashkres(true); + kexec_mark_dm_crypt_keys(true); } void arch_kexec_unprotect_crashkres(void) { + kexec_mark_dm_crypt_keys(false); kexec_mark_crashkres(false); } #endif -- cgit v1.2.3-59-g8ed1b From aaf05e96e93cb9c8fc8d35fc4525715530397655 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Sun, 4 May 2025 20:08:30 +0200 Subject: kernel/watchdog: add /sys/kernel/{hard,soft}lockup_count Patch series "sysfs: add counters for lockups and stalls", v2. Commits 9db89b411170 ("exit: Expose "oops_count" to sysfs") and 8b05aa263361 ("panic: Expose "warn_count" to sysfs") added counters for oopses and warnings to sysfs, and these two patches do the same for hard/soft lockups and RCU stalls. All of these counters are useful for monitoring tools to detect whether the machine is healthy. If the kernel has experienced a lockup or a stall, it's probably due to a kernel bug, and I'd like to detect that quickly and easily. There is currently no way to detect that, other than parsing dmesg. Or observing indirect effects: such as certain tasks not responding, but then I need to observe all tasks, and it may take a while until these effects become visible/measurable. I'd rather be able to detect the primary cause more quickly, possibly before everything falls apart. This patch (of 2): There is /proc/sys/kernel/hung_task_detect_count, /sys/kernel/warn_count and /sys/kernel/oops_count but there is no userspace-accessible counter for hard/soft lockups. Having this is useful for monitoring tools. Link: https://lkml.kernel.org/r/20250504180831.4190860-1-max.kellermann@ionos.com Link: https://lkml.kernel.org/r/20250504180831.4190860-2-max.kellermann@ionos.com Signed-off-by: Max Kellermann Cc: Cc: Core Minyard Cc: Doug Anderson Cc: Joel Granados Cc: Song Liu Cc: Kees Cook Signed-off-by: Andrew Morton --- .../ABI/testing/sysfs-kernel-hardlockup_count | 7 +++ .../ABI/testing/sysfs-kernel-softlockup_count | 7 +++ kernel/watchdog.c | 53 ++++++++++++++++++++++ 3 files changed, 67 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-kernel-hardlockup_count create mode 100644 Documentation/ABI/testing/sysfs-kernel-softlockup_count diff --git a/Documentation/ABI/testing/sysfs-kernel-hardlockup_count b/Documentation/ABI/testing/sysfs-kernel-hardlockup_count new file mode 100644 index 000000000000..dfdd4078b077 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-kernel-hardlockup_count @@ -0,0 +1,7 @@ +What: /sys/kernel/hardlockup_count +Date: May 2025 +KernelVersion: 6.16 +Contact: Linux kernel mailing list +Description: + Shows how many times the system has detected a hard lockup since last boot. + Available only if CONFIG_HARDLOCKUP_DETECTOR is enabled. diff --git a/Documentation/ABI/testing/sysfs-kernel-softlockup_count b/Documentation/ABI/testing/sysfs-kernel-softlockup_count new file mode 100644 index 000000000000..337ff5531b5f --- /dev/null +++ b/Documentation/ABI/testing/sysfs-kernel-softlockup_count @@ -0,0 +1,7 @@ +What: /sys/kernel/softlockup_count +Date: May 2025 +KernelVersion: 6.16 +Contact: Linux kernel mailing list +Description: + Shows how many times the system has detected a soft lockup since last boot. + Available only if CONFIG_SOFTLOCKUP_DETECTOR is enabled. diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 2d283e92be5a..80b56c002c7f 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -64,6 +64,29 @@ int __read_mostly sysctl_hardlockup_all_cpu_backtrace; */ unsigned int __read_mostly hardlockup_panic = IS_ENABLED(CONFIG_BOOTPARAM_HARDLOCKUP_PANIC); + +#ifdef CONFIG_SYSFS + +static unsigned int hardlockup_count; + +static ssize_t hardlockup_count_show(struct kobject *kobj, struct kobj_attribute *attr, + char *page) +{ + return sysfs_emit(page, "%u\n", hardlockup_count); +} + +static struct kobj_attribute hardlockup_count_attr = __ATTR_RO(hardlockup_count); + +static __init int kernel_hardlockup_sysfs_init(void) +{ + sysfs_add_file_to_group(kernel_kobj, &hardlockup_count_attr.attr, NULL); + return 0; +} + +late_initcall(kernel_hardlockup_sysfs_init); + +#endif // CONFIG_SYSFS + /* * We may not want to enable hard lockup detection by default in all cases, * for example when running the kernel as a guest on a hypervisor. In these @@ -170,6 +193,10 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs) unsigned int this_cpu = smp_processor_id(); unsigned long flags; +#ifdef CONFIG_SYSFS + ++hardlockup_count; +#endif + /* Only print hardlockups once. */ if (per_cpu(watchdog_hardlockup_warned, cpu)) return; @@ -312,6 +339,28 @@ unsigned int __read_mostly softlockup_panic = static bool softlockup_initialized __read_mostly; static u64 __read_mostly sample_period; +#ifdef CONFIG_SYSFS + +static unsigned int softlockup_count; + +static ssize_t softlockup_count_show(struct kobject *kobj, struct kobj_attribute *attr, + char *page) +{ + return sysfs_emit(page, "%u\n", softlockup_count); +} + +static struct kobj_attribute softlockup_count_attr = __ATTR_RO(softlockup_count); + +static __init int kernel_softlockup_sysfs_init(void) +{ + sysfs_add_file_to_group(kernel_kobj, &softlockup_count_attr.attr, NULL); + return 0; +} + +late_initcall(kernel_softlockup_sysfs_init); + +#endif // CONFIG_SYSFS + /* Timestamp taken after the last successful reschedule. */ static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); /* Timestamp of the last softlockup report. */ @@ -743,6 +792,10 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) touch_ts = __this_cpu_read(watchdog_touch_ts); duration = is_softlockup(touch_ts, period_ts, now); if (unlikely(duration)) { +#ifdef CONFIG_SYSFS + ++softlockup_count; +#endif + /* * Prevent multiple soft-lockup reports if one cpu is already * engaged in dumping all cpu back traces. -- cgit v1.2.3-59-g8ed1b From 2536c5c7d6ae5e1d844aa21f28b326b5e7f815ef Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Sun, 4 May 2025 20:08:31 +0200 Subject: kernel/rcu/tree_stall: add /sys/kernel/rcu_stall_count Expose a simple counter to userspace for monitoring tools. Link: https://lkml.kernel.org/r/20250504180831.4190860-3-max.kellermann@ionos.com Signed-off-by: Max Kellermann Cc: Core Minyard Cc: Doug Anderson Cc: Joel Granados Cc: Kees Cook Cc: Song Liu Signed-off-by: Andrew Morton --- Documentation/ABI/testing/sysfs-kernel-rcu_stall_count | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-kernel-rcu_stall_count diff --git a/Documentation/ABI/testing/sysfs-kernel-rcu_stall_count b/Documentation/ABI/testing/sysfs-kernel-rcu_stall_count new file mode 100644 index 000000000000..a4a97a7f4a4d --- /dev/null +++ b/Documentation/ABI/testing/sysfs-kernel-rcu_stall_count @@ -0,0 +1,6 @@ +What: /sys/kernel/rcu_stall_count +Date: May 2025 +KernelVersion: 6.16 +Contact: Linux kernel mailing list +Description: + Shows how many times the system has detected an RCU stall since last boot. -- cgit v1.2.3-59-g8ed1b From 85e1f758b6d770c9d9a7c7de4937e2eb2c200aea Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Fri, 9 May 2025 08:29:26 +0200 Subject: fork: clean-up ifdef logic around stack allocation Patch series "fork: Page operation cleanups in the fork code", v3. This patchset consists of outtakes from a 1 year+ old patchset from Pasha, which all stand on their own. See: https://lore.kernel.org/all/20240311164638.2015063-1-pasha.tatashin@soleen.com/ These are good cleanups for readability so I split these off, rebased on v6.15-rc1, addressed review comments and send them separately. All mentions of dynamic stack are removed from the patchset as we have no idea whether that will go anywhere. This patch (of 3): There is unneeded OR in the ifdef functions that are used to allocate and free kernel stacks based on direct map or vmap. Therefore, clean up by changing the order so OR is no longer needed. [linus.walleij@linaro.org: rebased] Link: https://lkml.kernel.org/r/20250509-fork-fixes-v3-1-e6c69dd356f2@linaro.org Link: https://lkml.kernel.org/r/20250509-fork-fixes-v3-0-e6c69dd356f2@linaro.org Signed-off-by: Pasha Tatashin Link: https://lore.kernel.org/20240311164638.2015063-3-pasha.tatashin@soleen.com Signed-off-by: Linus Walleij Acked-by: Mike Rapoport (Microsoft) Signed-off-by: Andrew Morton --- kernel/fork.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index c4b26cd8998b..7b9e1ad141ba 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -185,13 +185,7 @@ static inline void free_task_struct(struct task_struct *tsk) kmem_cache_free(task_struct_cachep, tsk); } -/* - * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a - * kmemcache based allocator. - */ -# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) - -# ifdef CONFIG_VMAP_STACK +#ifdef CONFIG_VMAP_STACK /* * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB * flush. Try to minimize the number of calls by caching stacks. @@ -342,7 +336,13 @@ static void free_thread_stack(struct task_struct *tsk) tsk->stack_vm_area = NULL; } -# else /* !CONFIG_VMAP_STACK */ +#else /* !CONFIG_VMAP_STACK */ + +/* + * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a + * kmemcache based allocator. + */ +#if THREAD_SIZE >= PAGE_SIZE static void thread_stack_free_rcu(struct rcu_head *rh) { @@ -374,8 +374,7 @@ static void free_thread_stack(struct task_struct *tsk) tsk->stack = NULL; } -# endif /* CONFIG_VMAP_STACK */ -# else /* !(THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)) */ +#else /* !(THREAD_SIZE >= PAGE_SIZE) */ static struct kmem_cache *thread_stack_cache; @@ -414,7 +413,8 @@ void thread_stack_cache_init(void) BUG_ON(thread_stack_cache == NULL); } -# endif /* THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) */ +#endif /* THREAD_SIZE >= PAGE_SIZE */ +#endif /* CONFIG_VMAP_STACK */ /* SLAB cache for signal_struct structures (tsk->signal) */ static struct kmem_cache *signal_cachep; -- cgit v1.2.3-59-g8ed1b From 90eb270d8eb46c2d54a13b938643fbc9cf56eaea Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Fri, 9 May 2025 08:29:27 +0200 Subject: fork: clean-up naming of vm_stack/vm_struct variables in vmap stacks code There are two data types: "struct vm_struct" and "struct vm_stack" that have the same local variable names: vm_stack, or vm, or s, which makes the code confusing to read. Change the code so the naming is consistent: struct vm_struct is always called vm_area struct vm_stack is always called vm_stack One change altering vfree(vm_stack) to vfree(vm_area->addr) may look like a semantic change but it is not: vm_area->addr points to the vm_stack. This was done to improve readability. [linus.walleij@linaro.org: rebased and added new users of the variable names, address review comments] Link: https://lore.kernel.org/20240311164638.2015063-4-pasha.tatashin@soleen.com Link: https://lkml.kernel.org/r/20250509-fork-fixes-v3-2-e6c69dd356f2@linaro.org Signed-off-by: Pasha Tatashin Signed-off-by: Linus Walleij Acked-by: Mike Rapoport (Microsoft) Signed-off-by: Andrew Morton --- kernel/fork.c | 60 +++++++++++++++++++++++++++++------------------------------ 1 file changed, 29 insertions(+), 31 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index 7b9e1ad141ba..8b8457562740 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -198,14 +198,14 @@ struct vm_stack { struct vm_struct *stack_vm_area; }; -static bool try_release_thread_stack_to_cache(struct vm_struct *vm) +static bool try_release_thread_stack_to_cache(struct vm_struct *vm_area) { unsigned int i; for (i = 0; i < NR_CACHED_STACKS; i++) { struct vm_struct *tmp = NULL; - if (this_cpu_try_cmpxchg(cached_stacks[i], &tmp, vm)) + if (this_cpu_try_cmpxchg(cached_stacks[i], &tmp, vm_area)) return true; } return false; @@ -214,11 +214,12 @@ static bool try_release_thread_stack_to_cache(struct vm_struct *vm) static void thread_stack_free_rcu(struct rcu_head *rh) { struct vm_stack *vm_stack = container_of(rh, struct vm_stack, rcu); + struct vm_struct *vm_area = vm_stack->stack_vm_area; if (try_release_thread_stack_to_cache(vm_stack->stack_vm_area)) return; - vfree(vm_stack); + vfree(vm_area->addr); } static void thread_stack_delayed_free(struct task_struct *tsk) @@ -231,32 +232,32 @@ static void thread_stack_delayed_free(struct task_struct *tsk) static int free_vm_stack_cache(unsigned int cpu) { - struct vm_struct **cached_vm_stacks = per_cpu_ptr(cached_stacks, cpu); + struct vm_struct **cached_vm_stack_areas = per_cpu_ptr(cached_stacks, cpu); int i; for (i = 0; i < NR_CACHED_STACKS; i++) { - struct vm_struct *vm_stack = cached_vm_stacks[i]; + struct vm_struct *vm_area = cached_vm_stack_areas[i]; - if (!vm_stack) + if (!vm_area) continue; - vfree(vm_stack->addr); - cached_vm_stacks[i] = NULL; + vfree(vm_area->addr); + cached_vm_stack_areas[i] = NULL; } return 0; } -static int memcg_charge_kernel_stack(struct vm_struct *vm) +static int memcg_charge_kernel_stack(struct vm_struct *vm_area) { int i; int ret; int nr_charged = 0; - BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE); + BUG_ON(vm_area->nr_pages != THREAD_SIZE / PAGE_SIZE); for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { - ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL, 0); + ret = memcg_kmem_charge_page(vm_area->pages[i], GFP_KERNEL, 0); if (ret) goto err; nr_charged++; @@ -264,38 +265,35 @@ static int memcg_charge_kernel_stack(struct vm_struct *vm) return 0; err: for (i = 0; i < nr_charged; i++) - memcg_kmem_uncharge_page(vm->pages[i], 0); + memcg_kmem_uncharge_page(vm_area->pages[i], 0); return ret; } static int alloc_thread_stack_node(struct task_struct *tsk, int node) { - struct vm_struct *vm; + struct vm_struct *vm_area; void *stack; int i; for (i = 0; i < NR_CACHED_STACKS; i++) { - struct vm_struct *s; - - s = this_cpu_xchg(cached_stacks[i], NULL); - - if (!s) + vm_area = this_cpu_xchg(cached_stacks[i], NULL); + if (!vm_area) continue; /* Reset stack metadata. */ - kasan_unpoison_range(s->addr, THREAD_SIZE); + kasan_unpoison_range(vm_area->addr, THREAD_SIZE); - stack = kasan_reset_tag(s->addr); + stack = kasan_reset_tag(vm_area->addr); /* Clear stale pointers from reused stack. */ memset(stack, 0, THREAD_SIZE); - if (memcg_charge_kernel_stack(s)) { - vfree(s->addr); + if (memcg_charge_kernel_stack(vm_area)) { + vfree(vm_area->addr); return -ENOMEM; } - tsk->stack_vm_area = s; + tsk->stack_vm_area = vm_area; tsk->stack = stack; return 0; } @@ -311,8 +309,8 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node) if (!stack) return -ENOMEM; - vm = find_vm_area(stack); - if (memcg_charge_kernel_stack(vm)) { + vm_area = find_vm_area(stack); + if (memcg_charge_kernel_stack(vm_area)) { vfree(stack); return -ENOMEM; } @@ -321,7 +319,7 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node) * free_thread_stack() can be called in interrupt context, * so cache the vm_struct. */ - tsk->stack_vm_area = vm; + tsk->stack_vm_area = vm_area; stack = kasan_reset_tag(stack); tsk->stack = stack; return 0; @@ -517,11 +515,11 @@ void vm_area_free(struct vm_area_struct *vma) static void account_kernel_stack(struct task_struct *tsk, int account) { if (IS_ENABLED(CONFIG_VMAP_STACK)) { - struct vm_struct *vm = task_stack_vm_area(tsk); + struct vm_struct *vm_area = task_stack_vm_area(tsk); int i; for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) - mod_lruvec_page_state(vm->pages[i], NR_KERNEL_STACK_KB, + mod_lruvec_page_state(vm_area->pages[i], NR_KERNEL_STACK_KB, account * (PAGE_SIZE / 1024)); } else { void *stack = task_stack_page(tsk); @@ -537,12 +535,12 @@ void exit_task_stack_account(struct task_struct *tsk) account_kernel_stack(tsk, -1); if (IS_ENABLED(CONFIG_VMAP_STACK)) { - struct vm_struct *vm; + struct vm_struct *vm_area; int i; - vm = task_stack_vm_area(tsk); + vm_area = task_stack_vm_area(tsk); for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) - memcg_kmem_uncharge_page(vm->pages[i], 0); + memcg_kmem_uncharge_page(vm_area->pages[i], 0); } } -- cgit v1.2.3-59-g8ed1b From d82893c52a64dd5698362ba1d7cb232a18839c87 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Fri, 9 May 2025 08:29:28 +0200 Subject: fork: check charging success before zeroing stack No need to do zero cached stack if memcg charge fails, so move the charging attempt before the memset operation. [linus.walleij@linaro.org: rebased] Link: https://lkml.kernel.org/r/20250509-fork-fixes-v3-3-e6c69dd356f2@linaro.org Signed-off-by: Pasha Tatashin Link: https://lore.kernel.org/20240311164638.2015063-6-pasha.tatashin@soleen.com Signed-off-by: Linus Walleij Acked-by: Mike Rapoport (Microsoft) Signed-off-by: Andrew Morton --- kernel/fork.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index 8b8457562740..d6907c49ee87 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -280,6 +280,11 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node) if (!vm_area) continue; + if (memcg_charge_kernel_stack(vm_area)) { + vfree(vm_area->addr); + return -ENOMEM; + } + /* Reset stack metadata. */ kasan_unpoison_range(vm_area->addr, THREAD_SIZE); @@ -288,11 +293,6 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node) /* Clear stale pointers from reused stack. */ memset(stack, 0, THREAD_SIZE); - if (memcg_charge_kernel_stack(vm_area)) { - vfree(vm_area->addr); - return -ENOMEM; - } - tsk->stack_vm_area = vm_area; tsk->stack = stack; return 0; -- cgit v1.2.3-59-g8ed1b From 8e02b1b7fcffcde7cc7d7749513ac1b0fbbfcf0a Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Fri, 9 May 2025 09:25:09 +0200 Subject: fork: define a local GFP_VMAP_STACK The current allocation of VMAP stack memory is using (THREADINFO_GFP & ~__GFP_ACCOUNT) which is a complicated way of saying (GFP_KERNEL | __GFP_ZERO): : define THREADINFO_GFP (GFP_KERNEL_ACCOUNT | __GFP_ZERO) : define GFP_KERNEL_ACCOUNT (GFP_KERNEL | __GFP_ACCOUNT) This is an unfortunate side-effect of independent changes blurring the picture: commit 19809c2da28aee5860ad9a2eff760730a0710df0 changed (THREADINFO_GFP | __GFP_HIGHMEM) to just THREADINFO_GFP since highmem became implicit. commit 9b6f7e163cd0f468d1b9696b785659d3c27c8667 then added stack caching and rewrote the allocation to (THREADINFO_GFP & ~__GFP_ACCOUNT) as cached stacks need to be accounted separately. However that code, when it eventually accounts the memory does this: ret = memcg_kmem_charge(vm->pages[i], GFP_KERNEL, 0) so the memory is charged as a GFP_KERNEL allocation. Define a unique GFP_VMAP_STACK to use GFP_KERNEL | __GFP_ZERO and move the comment there. Link: https://lkml.kernel.org/r/20250509-gfp-stack-v1-1-82f6f7efc210@linaro.org Signed-off-by: Linus Walleij Reported-by: Mateusz Guzik Cc: Pasha Tatashin Signed-off-by: Andrew Morton --- kernel/fork.c | 88 ++++++++++++++++++++++++++++++----------------------------- 1 file changed, 45 insertions(+), 43 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index d6907c49ee87..c4b26cd8998b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -185,7 +185,13 @@ static inline void free_task_struct(struct task_struct *tsk) kmem_cache_free(task_struct_cachep, tsk); } -#ifdef CONFIG_VMAP_STACK +/* + * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a + * kmemcache based allocator. + */ +# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) + +# ifdef CONFIG_VMAP_STACK /* * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB * flush. Try to minimize the number of calls by caching stacks. @@ -198,14 +204,14 @@ struct vm_stack { struct vm_struct *stack_vm_area; }; -static bool try_release_thread_stack_to_cache(struct vm_struct *vm_area) +static bool try_release_thread_stack_to_cache(struct vm_struct *vm) { unsigned int i; for (i = 0; i < NR_CACHED_STACKS; i++) { struct vm_struct *tmp = NULL; - if (this_cpu_try_cmpxchg(cached_stacks[i], &tmp, vm_area)) + if (this_cpu_try_cmpxchg(cached_stacks[i], &tmp, vm)) return true; } return false; @@ -214,12 +220,11 @@ static bool try_release_thread_stack_to_cache(struct vm_struct *vm_area) static void thread_stack_free_rcu(struct rcu_head *rh) { struct vm_stack *vm_stack = container_of(rh, struct vm_stack, rcu); - struct vm_struct *vm_area = vm_stack->stack_vm_area; if (try_release_thread_stack_to_cache(vm_stack->stack_vm_area)) return; - vfree(vm_area->addr); + vfree(vm_stack); } static void thread_stack_delayed_free(struct task_struct *tsk) @@ -232,32 +237,32 @@ static void thread_stack_delayed_free(struct task_struct *tsk) static int free_vm_stack_cache(unsigned int cpu) { - struct vm_struct **cached_vm_stack_areas = per_cpu_ptr(cached_stacks, cpu); + struct vm_struct **cached_vm_stacks = per_cpu_ptr(cached_stacks, cpu); int i; for (i = 0; i < NR_CACHED_STACKS; i++) { - struct vm_struct *vm_area = cached_vm_stack_areas[i]; + struct vm_struct *vm_stack = cached_vm_stacks[i]; - if (!vm_area) + if (!vm_stack) continue; - vfree(vm_area->addr); - cached_vm_stack_areas[i] = NULL; + vfree(vm_stack->addr); + cached_vm_stacks[i] = NULL; } return 0; } -static int memcg_charge_kernel_stack(struct vm_struct *vm_area) +static int memcg_charge_kernel_stack(struct vm_struct *vm) { int i; int ret; int nr_charged = 0; - BUG_ON(vm_area->nr_pages != THREAD_SIZE / PAGE_SIZE); + BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE); for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { - ret = memcg_kmem_charge_page(vm_area->pages[i], GFP_KERNEL, 0); + ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL, 0); if (ret) goto err; nr_charged++; @@ -265,35 +270,38 @@ static int memcg_charge_kernel_stack(struct vm_struct *vm_area) return 0; err: for (i = 0; i < nr_charged; i++) - memcg_kmem_uncharge_page(vm_area->pages[i], 0); + memcg_kmem_uncharge_page(vm->pages[i], 0); return ret; } static int alloc_thread_stack_node(struct task_struct *tsk, int node) { - struct vm_struct *vm_area; + struct vm_struct *vm; void *stack; int i; for (i = 0; i < NR_CACHED_STACKS; i++) { - vm_area = this_cpu_xchg(cached_stacks[i], NULL); - if (!vm_area) - continue; + struct vm_struct *s; - if (memcg_charge_kernel_stack(vm_area)) { - vfree(vm_area->addr); - return -ENOMEM; - } + s = this_cpu_xchg(cached_stacks[i], NULL); + + if (!s) + continue; /* Reset stack metadata. */ - kasan_unpoison_range(vm_area->addr, THREAD_SIZE); + kasan_unpoison_range(s->addr, THREAD_SIZE); - stack = kasan_reset_tag(vm_area->addr); + stack = kasan_reset_tag(s->addr); /* Clear stale pointers from reused stack. */ memset(stack, 0, THREAD_SIZE); - tsk->stack_vm_area = vm_area; + if (memcg_charge_kernel_stack(s)) { + vfree(s->addr); + return -ENOMEM; + } + + tsk->stack_vm_area = s; tsk->stack = stack; return 0; } @@ -309,8 +317,8 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node) if (!stack) return -ENOMEM; - vm_area = find_vm_area(stack); - if (memcg_charge_kernel_stack(vm_area)) { + vm = find_vm_area(stack); + if (memcg_charge_kernel_stack(vm)) { vfree(stack); return -ENOMEM; } @@ -319,7 +327,7 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node) * free_thread_stack() can be called in interrupt context, * so cache the vm_struct. */ - tsk->stack_vm_area = vm_area; + tsk->stack_vm_area = vm; stack = kasan_reset_tag(stack); tsk->stack = stack; return 0; @@ -334,13 +342,7 @@ static void free_thread_stack(struct task_struct *tsk) tsk->stack_vm_area = NULL; } -#else /* !CONFIG_VMAP_STACK */ - -/* - * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a - * kmemcache based allocator. - */ -#if THREAD_SIZE >= PAGE_SIZE +# else /* !CONFIG_VMAP_STACK */ static void thread_stack_free_rcu(struct rcu_head *rh) { @@ -372,7 +374,8 @@ static void free_thread_stack(struct task_struct *tsk) tsk->stack = NULL; } -#else /* !(THREAD_SIZE >= PAGE_SIZE) */ +# endif /* CONFIG_VMAP_STACK */ +# else /* !(THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)) */ static struct kmem_cache *thread_stack_cache; @@ -411,8 +414,7 @@ void thread_stack_cache_init(void) BUG_ON(thread_stack_cache == NULL); } -#endif /* THREAD_SIZE >= PAGE_SIZE */ -#endif /* CONFIG_VMAP_STACK */ +# endif /* THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) */ /* SLAB cache for signal_struct structures (tsk->signal) */ static struct kmem_cache *signal_cachep; @@ -515,11 +517,11 @@ void vm_area_free(struct vm_area_struct *vma) static void account_kernel_stack(struct task_struct *tsk, int account) { if (IS_ENABLED(CONFIG_VMAP_STACK)) { - struct vm_struct *vm_area = task_stack_vm_area(tsk); + struct vm_struct *vm = task_stack_vm_area(tsk); int i; for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) - mod_lruvec_page_state(vm_area->pages[i], NR_KERNEL_STACK_KB, + mod_lruvec_page_state(vm->pages[i], NR_KERNEL_STACK_KB, account * (PAGE_SIZE / 1024)); } else { void *stack = task_stack_page(tsk); @@ -535,12 +537,12 @@ void exit_task_stack_account(struct task_struct *tsk) account_kernel_stack(tsk, -1); if (IS_ENABLED(CONFIG_VMAP_STACK)) { - struct vm_struct *vm_area; + struct vm_struct *vm; int i; - vm_area = task_stack_vm_area(tsk); + vm = task_stack_vm_area(tsk); for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) - memcg_kmem_uncharge_page(vm_area->pages[i], 0); + memcg_kmem_uncharge_page(vm->pages[i], 0); } } -- cgit v1.2.3-59-g8ed1b From 84e437640ba43259c81048ded6adb5357a844360 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 16 May 2025 21:31:13 +0900 Subject: nilfs2: remove wbc->for_reclaim handling Since commit 013a07052a1a ("nilfs2: convert metadata aops from writepage to writepages"), nilfs_mdt_write_folio can't be called from reclaim context any more. Remove the code keyed of the wbc->for_reclaim flag, which is now only set for writing out swap or shmem pages inside the swap code, but never passed to file systems. Link: https://lkml.kernel.org/r/20250508054938.15894-7-hch@lst.de Link: https://lkml.kernel.org/r/20250516123417.6779-1-konishi.ryusuke@gmail.com Signed-off-by: Christoph Hellwig Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/mdt.c | 2 -- fs/nilfs2/segment.c | 16 ---------------- fs/nilfs2/segment.h | 1 - 3 files changed, 19 deletions(-) diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index 2f850a18d6e7..946b0d3534a5 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -422,8 +422,6 @@ static int nilfs_mdt_write_folio(struct folio *folio, if (wbc->sync_mode == WB_SYNC_ALL) err = nilfs_construct_segment(sb); - else if (wbc->for_reclaim) - nilfs_flush_segment(sb, inode->i_ino); return err; } diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 83970d97840b..61a4141f8d6b 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -2221,22 +2221,6 @@ static void nilfs_segctor_do_flush(struct nilfs_sc_info *sci, int bn) spin_unlock(&sci->sc_state_lock); } -/** - * nilfs_flush_segment - trigger a segment construction for resource control - * @sb: super block - * @ino: inode number of the file to be flushed out. - */ -void nilfs_flush_segment(struct super_block *sb, ino_t ino) -{ - struct the_nilfs *nilfs = sb->s_fs_info; - struct nilfs_sc_info *sci = nilfs->ns_writer; - - if (!sci || nilfs_doing_construction()) - return; - nilfs_segctor_do_flush(sci, NILFS_MDT_INODE(sb, ino) ? ino : 0); - /* assign bit 0 to data files */ -} - struct nilfs_segctor_wait_request { wait_queue_entry_t wq; __u32 seq; diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h index f723f47ddc4e..4b39ed43ae72 100644 --- a/fs/nilfs2/segment.h +++ b/fs/nilfs2/segment.h @@ -226,7 +226,6 @@ extern void nilfs_relax_pressure_in_lock(struct super_block *); extern int nilfs_construct_segment(struct super_block *); extern int nilfs_construct_dsync_segment(struct super_block *, struct inode *, loff_t, loff_t); -extern void nilfs_flush_segment(struct super_block *, ino_t); extern int nilfs_clean_segments(struct super_block *, struct nilfs_argv *, void **); -- cgit v1.2.3-59-g8ed1b From f68b5d165c906d11948c78e5d4b55a3e0975bba4 Mon Sep 17 00:00:00 2001 From: Casey Connolly Date: Sat, 17 May 2025 23:32:38 +0100 Subject: mailmap: update and consolidate Casey Connolly's name and email I've used several email addresses and a previous name to contribute. Consolidate all of these to my primary email and update my name. Link: https://lkml.kernel.org/r/20250517223237.15647-2-casey.connolly@linaro.org Signed-off-by: Casey Connolly Cc: Krzysztof Kozlowski Signed-off-by: Andrew Morton --- .mailmap | 3 +++ Documentation/devicetree/bindings/display/panel/lg,sw43408.yaml | 2 +- Documentation/devicetree/bindings/iio/adc/qcom,spmi-rradc.yaml | 2 +- .../devicetree/bindings/power/supply/qcom,pmi8998-charger.yaml | 2 +- MAINTAINERS | 2 +- arch/arm64/boot/dts/qcom/qcm6490-shift-otter.dts | 2 +- arch/arm64/boot/dts/qcom/sdm845-shift-axolotl.dts | 2 +- drivers/gpu/drm/panel/panel-samsung-sofef00.c | 4 ++-- drivers/iio/adc/qcom-spmi-rradc.c | 4 ++-- drivers/power/supply/qcom_pmi8998_charger.c | 4 ++-- include/soc/qcom/qcom-spmi-pmic.h | 2 +- 11 files changed, 16 insertions(+), 13 deletions(-) diff --git a/.mailmap b/.mailmap index 1c70e51c789d..352b734c919c 100644 --- a/.mailmap +++ b/.mailmap @@ -154,6 +154,9 @@ Brian King Brian Silverman Bryan Tan Cai Huoqing +Casey Connolly +Casey Connolly +Casey Connolly Can Guo Carl Huang Carlos Bilbao diff --git a/Documentation/devicetree/bindings/display/panel/lg,sw43408.yaml b/Documentation/devicetree/bindings/display/panel/lg,sw43408.yaml index bbaaa783d184..2219d3d4ac43 100644 --- a/Documentation/devicetree/bindings/display/panel/lg,sw43408.yaml +++ b/Documentation/devicetree/bindings/display/panel/lg,sw43408.yaml @@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: LG SW43408 1080x2160 DSI panel maintainers: - - Caleb Connolly + - Casey Connolly description: This panel is used on the Pixel 3, it is a 60hz OLED panel which diff --git a/Documentation/devicetree/bindings/iio/adc/qcom,spmi-rradc.yaml b/Documentation/devicetree/bindings/iio/adc/qcom,spmi-rradc.yaml index f39bc92c2b99..862e450da214 100644 --- a/Documentation/devicetree/bindings/iio/adc/qcom,spmi-rradc.yaml +++ b/Documentation/devicetree/bindings/iio/adc/qcom,spmi-rradc.yaml @@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Qualcomm's SPMI PMIC Round Robin ADC maintainers: - - Caleb Connolly + - Casey Connolly description: | The Qualcomm SPMI Round Robin ADC (RRADC) provides interface to clients to diff --git a/Documentation/devicetree/bindings/power/supply/qcom,pmi8998-charger.yaml b/Documentation/devicetree/bindings/power/supply/qcom,pmi8998-charger.yaml index a0f9d49ff8fb..90c7dc7632c5 100644 --- a/Documentation/devicetree/bindings/power/supply/qcom,pmi8998-charger.yaml +++ b/Documentation/devicetree/bindings/power/supply/qcom,pmi8998-charger.yaml @@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Qualcomm PMI8998/PM660 Switch-Mode Battery Charger "2" maintainers: - - Caleb Connolly + - Casey Connolly properties: compatible: diff --git a/MAINTAINERS b/MAINTAINERS index f21f1dabb5fe..88eb1b18da68 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7460,7 +7460,7 @@ F: drivers/gpu/drm/panel/panel-jadard-jd9365da-h3.c DRM DRIVER FOR LG SW43408 PANELS M: Sumit Semwal -M: Caleb Connolly +M: Casey Connolly S: Maintained T: git https://gitlab.freedesktop.org/drm/misc/kernel.git F: Documentation/devicetree/bindings/display/panel/lg,sw43408.yaml diff --git a/arch/arm64/boot/dts/qcom/qcm6490-shift-otter.dts b/arch/arm64/boot/dts/qcom/qcm6490-shift-otter.dts index 75930f957696..bc288b0ac8f8 100644 --- a/arch/arm64/boot/dts/qcom/qcm6490-shift-otter.dts +++ b/arch/arm64/boot/dts/qcom/qcm6490-shift-otter.dts @@ -1,7 +1,7 @@ // SPDX-License-Identifier: BSD-3-Clause /* * Copyright (c) 2023, Luca Weiss - * Copyright (c) 2024, Caleb Connolly + * Copyright (c) 2024, Casey Connolly */ /dts-v1/; diff --git a/arch/arm64/boot/dts/qcom/sdm845-shift-axolotl.dts b/arch/arm64/boot/dts/qcom/sdm845-shift-axolotl.dts index ddb82ecb0a92..f8e962433331 100644 --- a/arch/arm64/boot/dts/qcom/sdm845-shift-axolotl.dts +++ b/arch/arm64/boot/dts/qcom/sdm845-shift-axolotl.dts @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2022, Alexander Martinz - * Copyright (c) 2022, Caleb Connolly + * Copyright (c) 2022, Casey Connolly * Copyright (c) 2022, Dylan Van Assche */ diff --git a/drivers/gpu/drm/panel/panel-samsung-sofef00.c b/drivers/gpu/drm/panel/panel-samsung-sofef00.c index 04ce925b3d9d..b2782a5b5323 100644 --- a/drivers/gpu/drm/panel/panel-samsung-sofef00.c +++ b/drivers/gpu/drm/panel/panel-samsung-sofef00.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* Copyright (c) 2020 Caleb Connolly +/* Copyright (c) 2020 Casey Connolly * Generated with linux-mdss-dsi-panel-driver-generator from vendor device tree: * Copyright (c) 2020, The Linux Foundation. All rights reserved. */ @@ -318,6 +318,6 @@ static struct mipi_dsi_driver sofef00_panel_driver = { module_mipi_dsi_driver(sofef00_panel_driver); -MODULE_AUTHOR("Caleb Connolly "); +MODULE_AUTHOR("Casey Connolly "); MODULE_DESCRIPTION("DRM driver for Samsung AMOLED DSI panels found in OnePlus 6/6T phones"); MODULE_LICENSE("GPL v2"); diff --git a/drivers/iio/adc/qcom-spmi-rradc.c b/drivers/iio/adc/qcom-spmi-rradc.c index 63ebaf13ef19..f61ad0510f04 100644 --- a/drivers/iio/adc/qcom-spmi-rradc.c +++ b/drivers/iio/adc/qcom-spmi-rradc.c @@ -2,7 +2,7 @@ /* * Copyright (c) 2016-2017, 2019, The Linux Foundation. All rights reserved. * Copyright (c) 2022 Linaro Limited. - * Author: Caleb Connolly + * Author: Casey Connolly * * This driver is for the Round Robin ADC found in the pmi8998 and pm660 PMICs. */ @@ -1016,5 +1016,5 @@ static struct platform_driver rradc_driver = { module_platform_driver(rradc_driver); MODULE_DESCRIPTION("QCOM SPMI PMIC RR ADC driver"); -MODULE_AUTHOR("Caleb Connolly "); +MODULE_AUTHOR("Casey Connolly "); MODULE_LICENSE("GPL"); diff --git a/drivers/power/supply/qcom_pmi8998_charger.c b/drivers/power/supply/qcom_pmi8998_charger.c index 74a8d8ed8d9f..c2f8f2e24398 100644 --- a/drivers/power/supply/qcom_pmi8998_charger.c +++ b/drivers/power/supply/qcom_pmi8998_charger.c @@ -2,7 +2,7 @@ /* * Copyright (c) 2016-2019 The Linux Foundation. All rights reserved. * Copyright (c) 2023, Linaro Ltd. - * Author: Caleb Connolly + * Author: Casey Connolly * * This driver is for the switch-mode battery charger and boost * hardware found in pmi8998 and related PMICs. @@ -1045,6 +1045,6 @@ static struct platform_driver qcom_spmi_smb2 = { module_platform_driver(qcom_spmi_smb2); -MODULE_AUTHOR("Caleb Connolly "); +MODULE_AUTHOR("Casey Connolly "); MODULE_DESCRIPTION("Qualcomm SMB2 Charger Driver"); MODULE_LICENSE("GPL"); diff --git a/include/soc/qcom/qcom-spmi-pmic.h b/include/soc/qcom/qcom-spmi-pmic.h index a62d500a6fda..df3d3a0af98a 100644 --- a/include/soc/qcom/qcom-spmi-pmic.h +++ b/include/soc/qcom/qcom-spmi-pmic.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* Copyright (c) 2022 Linaro. All rights reserved. - * Author: Caleb Connolly + * Author: Casey Connolly */ #ifndef __QCOM_SPMI_PMIC_H__ -- cgit v1.2.3-59-g8ed1b From 85915c6cabf74d1f693fd09d95c25d838b82c840 Mon Sep 17 00:00:00 2001 From: Sravan Kumar Gundu Date: Fri, 16 May 2025 17:40:31 +0000 Subject: kernel/panic.c: format kernel-doc comments kernel-doc function comment don't follows documentation commenting style misinterpreting arguments description with function description. Please see latest docs generated before applying this patch https://docs.kernel.org/driver-api/basics.html#c.panic Link: https://lkml.kernel.org/r/20250516174031.2937-1-sravankumarlpu@gmail.com Signed-off-by: Sravan Kumar Gundu Cc: Greg Kroah-Hartman Cc: Jani Nikula Cc: John Ogness Cc: Petr Mladek Cc: Shuah Khan Signed-off-by: Andrew Morton --- kernel/panic.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/kernel/panic.c b/kernel/panic.c index a3889f38153d..1f52922d1b2e 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -277,12 +277,10 @@ static void panic_other_cpus_shutdown(bool crash_kexec) } /** - * panic - halt the system - * @fmt: The text string to print + * panic - halt the system + * @fmt: The text string to print * - * Display a message, then perform cleanups. - * - * This function never returns. + * Display a message, then perform cleanups. This function never returns. */ void panic(const char *fmt, ...) { -- cgit v1.2.3-59-g8ed1b From 3545414f2590177a76f86c985130dc4824d3adc9 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Thu, 15 May 2025 17:52:11 +0200 Subject: scripts/gdb/symbols: factor out get_vmlinux() Patch series "scripts/gdb/symbols: determine KASLR offset on s390 during early boot". I noticed that debugging s390 early boot using the support I introduced in commit 28939c3e9925 ("scripts/gdb/symbols: determine KASLR offset on s390") does not work. The reason is that decompressor does not provide the vmcoreinfo note, so KASLR offset needs to be extracted in a different way, which this series implements. Patches 1-2 are trivial refactorings, and patch 3 is the implementation. This patch (of 3): Move the code that determines the current vmlinux file into a separate function. It will be useful later in order to analyze the kernel image in physical memory during s390 early boot. Link: https://lkml.kernel.org/r/20250515155811.114392-1-iii@linux.ibm.com Link: https://lkml.kernel.org/r/20250515155811.114392-2-iii@linux.ibm.com Signed-off-by: Ilya Leoshkevich Cc: Alexander Gordeev Cc: Heiko Carstens Cc: Jan Kiszka Cc: Kieran Bingham Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- scripts/gdb/linux/symbols.py | 6 +----- scripts/gdb/linux/utils.py | 9 +++++++++ 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/scripts/gdb/linux/symbols.py b/scripts/gdb/linux/symbols.py index b255177301e9..25c4627c60e5 100644 --- a/scripts/gdb/linux/symbols.py +++ b/scripts/gdb/linux/symbols.py @@ -178,11 +178,7 @@ lx-symbols command.""" saved_states.append({'breakpoint': bp, 'enabled': bp.enabled}) # drop all current symbols and reload vmlinux - orig_vmlinux = 'vmlinux' - for obj in gdb.objfiles(): - if (obj.filename.endswith('vmlinux') or - obj.filename.endswith('vmlinux.debug')): - orig_vmlinux = obj.filename + orig_vmlinux = utils.get_vmlinux() gdb.execute("symbol-file", to_string=True) kerneloffset = get_kerneloffset() if kerneloffset is None: diff --git a/scripts/gdb/linux/utils.py b/scripts/gdb/linux/utils.py index 877404e92dbb..6e125830d3f2 100644 --- a/scripts/gdb/linux/utils.py +++ b/scripts/gdb/linux/utils.py @@ -251,3 +251,12 @@ def parse_vmcore(s): else: kerneloffset = int(match.group(1), 16) return VmCore(kerneloffset=kerneloffset) + + +def get_vmlinux(): + vmlinux = 'vmlinux' + for obj in gdb.objfiles(): + if (obj.filename.endswith('vmlinux') or + obj.filename.endswith('vmlinux.debug')): + vmlinux = obj.filename + return vmlinux -- cgit v1.2.3-59-g8ed1b From e97c4a27cb9c4c06fdc6d0760d7ea031c98b58a5 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Thu, 15 May 2025 17:52:12 +0200 Subject: scripts/gdb/symbols: factor out pagination_off() Move the code that turns off pagination into a separate function. It will be useful later in order to prevent hangs when loading symbols for kernel image in physical memory during s390 early boot. Link: https://lkml.kernel.org/r/20250515155811.114392-3-iii@linux.ibm.com Signed-off-by: Ilya Leoshkevich Cc: Alexander Gordeev Cc: Heiko Carstens Cc: Jan Kiszka Cc: Kieran Bingham Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- scripts/gdb/linux/symbols.py | 20 +++++++------------- scripts/gdb/linux/utils.py | 11 +++++++++++ 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/scripts/gdb/linux/symbols.py b/scripts/gdb/linux/symbols.py index 25c4627c60e5..0c7af712c44c 100644 --- a/scripts/gdb/linux/symbols.py +++ b/scripts/gdb/linux/symbols.py @@ -38,19 +38,13 @@ if hasattr(gdb, 'Breakpoint'): # Disable pagination while reporting symbol (re-)loading. # The console input is blocked in this context so that we would # get stuck waiting for the user to acknowledge paged output. - show_pagination = gdb.execute("show pagination", to_string=True) - pagination = show_pagination.endswith("on.\n") - gdb.execute("set pagination off") - - if module_name in cmd.loaded_modules: - gdb.write("refreshing all symbols to reload module " - "'{0}'\n".format(module_name)) - cmd.load_all_symbols() - else: - cmd.load_module_symbols(module) - - # restore pagination state - gdb.execute("set pagination %s" % ("on" if pagination else "off")) + with utils.pagination_off(): + if module_name in cmd.loaded_modules: + gdb.write("refreshing all symbols to reload module " + "'{0}'\n".format(module_name)) + cmd.load_all_symbols() + else: + cmd.load_module_symbols(module) return False diff --git a/scripts/gdb/linux/utils.py b/scripts/gdb/linux/utils.py index 6e125830d3f2..e11f6f67961a 100644 --- a/scripts/gdb/linux/utils.py +++ b/scripts/gdb/linux/utils.py @@ -260,3 +260,14 @@ def get_vmlinux(): obj.filename.endswith('vmlinux.debug')): vmlinux = obj.filename return vmlinux + + +@contextlib.contextmanager +def pagination_off(): + show_pagination = gdb.execute("show pagination", to_string=True) + pagination = show_pagination.endswith("on.\n") + gdb.execute("set pagination off") + try: + yield + finally: + gdb.execute("set pagination %s" % ("on" if pagination else "off")) -- cgit v1.2.3-59-g8ed1b From c164679bed3a5f0d235723c2395d9a5122b151c4 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Thu, 15 May 2025 17:52:13 +0200 Subject: scripts/gdb/symbols: determine KASLR offset on s390 during early boot Using lx-symbols during s390 early boot fails with: Error occurred in Python: 'utf-8' codec can't decode byte 0xcb in position 0: invalid continuation byte The reason is that s390 decompressor's startup_kernel() does not create vmcoreinfo note, and sets vmcore_info to kernel's physical base. This confuses get_vmcore_s390(). Fix by handling this special case. Extract vm_layout.kaslr_offset from the kernel image in physical memory, which is placed there by the decompressor using the __bootdata_preserved mechanism, and generate a synthetic vmcoreinfo note from it. Link: https://lkml.kernel.org/r/20250515155811.114392-4-iii@linux.ibm.com Signed-off-by: Ilya Leoshkevich Cc: Alexander Gordeev Cc: Heiko Carstens Cc: Jan Kiszka Cc: Kieran Bingham Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- scripts/gdb/linux/symbols.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/scripts/gdb/linux/symbols.py b/scripts/gdb/linux/symbols.py index 0c7af712c44c..2332bd8eddf1 100644 --- a/scripts/gdb/linux/symbols.py +++ b/scripts/gdb/linux/symbols.py @@ -54,6 +54,18 @@ def get_vmcore_s390(): vmcore_info = 0x0e0c paddr_vmcoreinfo_note = gdb.parse_and_eval("*(unsigned long long *)" + hex(vmcore_info)) + if paddr_vmcoreinfo_note == 0 or paddr_vmcoreinfo_note & 1: + # In the early boot case, extract vm_layout.kaslr_offset from the + # vmlinux image in physical memory. + if paddr_vmcoreinfo_note == 0: + kaslr_offset_phys = 0 + else: + kaslr_offset_phys = paddr_vmcoreinfo_note - 1 + with utils.pagination_off(): + gdb.execute("symbol-file {0} -o {1}".format( + utils.get_vmlinux(), hex(kaslr_offset_phys))) + kaslr_offset = gdb.parse_and_eval("vm_layout.kaslr_offset") + return "KERNELOFFSET=" + hex(kaslr_offset)[2:] inferior = gdb.selected_inferior() elf_note = inferior.read_memory(paddr_vmcoreinfo_note, 12) n_namesz, n_descsz, n_type = struct.unpack(">III", elf_note) -- cgit v1.2.3-59-g8ed1b From 4496e1c1354bd4837bcc1414f6e1a4d042857903 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 21 May 2025 18:03:19 +0200 Subject: crash_dump, nvme: select CONFIGFS_FS as built-in Configfs can be configured as a loadable module, which causes a link-time failure for dm-crypt crash dump support: crash_dump_dm_crypt.c:(.text+0x3a4): undefined reference to `config_item_init_type_name' aarch64-linux-ld: kernel/crash_dump_dm_crypt.o: in function `configfs_dmcrypt_keys_init': crash_dump_dm_crypt.c:(.init.text+0x90): undefined reference to `config_group_init' aarch64-linux-ld: crash_dump_dm_crypt.c:(.init.text+0xb4): undefined reference to `configfs_register_subsystem' aarch64-linux-ld: crash_dump_dm_crypt.c:(.init.text+0xd8): undefined reference to `configfs_unregister_subsystem' This could be avoided with a dependency on CONFIGFS_FS=y, but the dependency has an additional problem of causing Kconfig dependency loops since most other uses select the symbol. Using a simple 'select CONFIGFS_FS' here in turn fails with CONFIG_DM_CRYPT=m, because that still only causes configfs to be a loadable module. The only version I found that fixes this reliably uses an additional Kconfig symbol to ensure the 'select' actually turns on configfs as builtin, with two additional changes to avoid dependency loops with nvme and sysfs. There is no compile-time dependency between configfs and sysfs, so selecting configfs from a driver with sysfs disabled does not cause link failures, only the default /sys/kernel/config mount point will not be created. Link: https://lkml.kernel.org/r/20250521160359.2132363-1-arnd@kernel.org Fixes: 6b23858fd63b ("crash_dump: make dm crypt keys persist for the kdump kernel") Fixes: 1fb470408497 ("nvme-loop: add configfs dependency") Signed-off-by: Arnd Bergmann Cc: Andreas Hindborg Cc: Breno Leitao Cc: Chaitanya Kulkarni Cc: Christoph Hellwig Cc: Coiby Xu Cc: Jens Axboe Cc: Sagi Grimberg Signed-off-by: Andrew Morton --- drivers/nvme/target/Kconfig | 2 +- fs/configfs/Kconfig | 1 - kernel/Kconfig.kexec | 8 +++++++- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/target/Kconfig b/drivers/nvme/target/Kconfig index 4c253b433bf7..4904097dfd49 100644 --- a/drivers/nvme/target/Kconfig +++ b/drivers/nvme/target/Kconfig @@ -3,7 +3,7 @@ config NVME_TARGET tristate "NVMe Target support" depends on BLOCK - depends on CONFIGFS_FS + select CONFIGFS_FS select NVME_KEYRING if NVME_TARGET_TCP_TLS select KEYS if NVME_TARGET_TCP_TLS select SGL_ALLOC diff --git a/fs/configfs/Kconfig b/fs/configfs/Kconfig index 272b64456999..1fcd761fe7be 100644 --- a/fs/configfs/Kconfig +++ b/fs/configfs/Kconfig @@ -1,7 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only config CONFIGFS_FS tristate "Userspace-driven configuration filesystem" - select SYSFS help configfs is a RAM-based filesystem that provides the converse of sysfs's functionality. Where sysfs is a filesystem-based diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec index c3e180d5e4fd..ceadf4a66496 100644 --- a/kernel/Kconfig.kexec +++ b/kernel/Kconfig.kexec @@ -120,12 +120,18 @@ config CRASH_DM_CRYPT depends on KEXEC_FILE depends on CRASH_DUMP depends on DM_CRYPT - depends on CONFIGFS_FS help With this option enabled, user space can intereact with /sys/kernel/config/crash_dm_crypt_keys to make the dm crypt keys persistent for the dump-capture kernel. +config CRASH_DM_CRYPT_CONFIGS + def_tristate CRASH_DM_CRYPT + select CONFIGFS_FS + help + CRASH_DM_CRYPT cannot directly select CONFIGFS_FS, because that + is required to be built-in. + config CRASH_HOTPLUG bool "Update the crash elfcorehdr on system configuration changes" default y -- cgit v1.2.3-59-g8ed1b From 2e227ff5e2729b5f4e0771826e3f6b61dd6a1b6b Mon Sep 17 00:00:00 2001 From: Chanho Min Date: Wed, 21 May 2025 16:25:59 +0900 Subject: squashfs: add optional full compressed block caching The commit 93e72b3c612adcaca1 ("squashfs: migrate from ll_rw_block usage to BIO") removed caching of compressed blocks in SquashFS, causing fio performance regression in workloads with repeated file reads. Without caching, every read triggers disk I/O, severely impacting performance in tools like fio. This patch introduces a new CONFIG_SQUASHFS_COMP_CACHE_FULL Kconfig option to enable caching of all compressed blocks, restoring performance to pre-BIO migration levels. When enabled, all pages in a BIO are cached in the page cache, reducing disk I/O for repeated reads. The fio test results with this patch confirm the performance restoration: For example, fio tests (iodepth=1, numjobs=1, ioengine=psync) show a notable performance restoration: Disable CONFIG_SQUASHFS_COMP_CACHE_FULL: IOPS=815, BW=102MiB/s (107MB/s)(6113MiB/60001msec) Enable CONFIG_SQUASHFS_COMP_CACHE_FULL: IOPS=2223, BW=278MiB/s (291MB/s)(16.3GiB/59999msec) The tradeoff is increased memory usage due to caching all compressed blocks. The CONFIG_SQUASHFS_COMP_CACHE_FULL option allows users to enable this feature selectively, balancing performance and memory usage for workloads with frequent repeated reads. Link: https://lkml.kernel.org/r/20250521072559.2389-1-chanho.min@lge.com Signed-off-by: Chanho Min Reviewed-by Phillip Lougher Signed-off-by: Andrew Morton --- fs/squashfs/Kconfig | 21 +++++++++++++++++++++ fs/squashfs/block.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+) diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig index b1091e70434a..a9602aae21ef 100644 --- a/fs/squashfs/Kconfig +++ b/fs/squashfs/Kconfig @@ -149,6 +149,27 @@ config SQUASHFS_XATTR If unsure, say N. +config SQUASHFS_COMP_CACHE_FULL + bool "Enable full caching of compressed blocks" + depends on SQUASHFS + default n + help + This option enables caching of all compressed blocks, Without caching, + repeated reads of the same files trigger excessive disk I/O, significantly + reducinng performance in workloads like fio-based benchmarks. + + For example, fio tests (iodepth=1, numjobs=1, ioengine=psync) show: + With caching: IOPS=2223, BW=278MiB/s (291MB/s) + Without caching: IOPS=815, BW=102MiB/s (107MB/s) + + Enabling this option restores performance to pre-regression levels by + caching all compressed blocks in the page cache, reducing disk I/O for + repeated reads. However, this increases memory usage, which may be a + concern in memory-constrained environments. + + Enable this option if your workload involves frequent repeated reads and + memory usage is not a limiting factor. If unsure, say N. + config SQUASHFS_ZLIB bool "Include support for ZLIB compressed file systems" depends on SQUASHFS diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 2dc730800f44..3061043e915c 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -88,6 +88,10 @@ static int squashfs_bio_read_cached(struct bio *fullbio, struct bio_vec *bv; int idx = 0; int err = 0; +#ifdef CONFIG_SQUASHFS_COMP_CACHE_FULL + struct page **cache_pages = kmalloc_array(page_count, + sizeof(void *), GFP_KERNEL | __GFP_ZERO); +#endif bio_for_each_segment_all(bv, fullbio, iter_all) { struct page *page = bv->bv_page; @@ -110,6 +114,11 @@ static int squashfs_bio_read_cached(struct bio *fullbio, head_to_cache = page; else if (idx == page_count - 1 && index + length != read_end) tail_to_cache = page; +#ifdef CONFIG_SQUASHFS_COMP_CACHE_FULL + /* Cache all pages in the BIO for repeated reads */ + else if (cache_pages) + cache_pages[idx] = page; +#endif if (!bio || idx != end_idx) { struct bio *new = bio_alloc_clone(bdev, fullbio, @@ -163,6 +172,25 @@ static int squashfs_bio_read_cached(struct bio *fullbio, } } +#ifdef CONFIG_SQUASHFS_COMP_CACHE_FULL + if (!cache_pages) + goto out; + + for (idx = 0; idx < page_count; idx++) { + if (!cache_pages[idx]) + continue; + int ret = add_to_page_cache_lru(cache_pages[idx], cache_mapping, + (read_start >> PAGE_SHIFT) + idx, + GFP_NOIO); + + if (!ret) { + SetPageUptodate(cache_pages[idx]); + unlock_page(cache_pages[idx]); + } + } + kfree(cache_pages); +out: +#endif return 0; } -- cgit v1.2.3-59-g8ed1b From 5ef2dccfcca8d864e2f4c5b1628a22b446bc009a Mon Sep 17 00:00:00 2001 From: Wang Yaxin Date: Wed, 21 May 2025 09:31:57 +0800 Subject: delayacct: remove redundant code and adjust indentation Remove redundant code and adjust indentation of xxx_delay_max/min. Link: https://lkml.kernel.org/r/20250521093157668iQrhhcMjA-th5LQf4-A3c@zte.com.cn Signed-off-by: Wang Yaxin Signed-off-by: Jiang Kun Cc: Balbir Singh Cc: xu xin Cc: Yang Yang Signed-off-by: Andrew Morton --- kernel/delayacct.c | 51 ++++++++++++++++----------------------------------- 1 file changed, 16 insertions(+), 35 deletions(-) diff --git a/kernel/delayacct.c b/kernel/delayacct.c index eb63a021ac04..30e7912ebb0d 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -14,6 +14,15 @@ #include #include +#define UPDATE_DELAY(type) \ +do { \ + d->type##_delay_max = tsk->delays->type##_delay_max; \ + d->type##_delay_min = tsk->delays->type##_delay_min; \ + tmp = d->type##_delay_total + tsk->delays->type##_delay; \ + d->type##_delay_total = (tmp < d->type##_delay_total) ? 0 : tmp; \ + d->type##_count += tsk->delays->type##_count; \ +} while (0) + DEFINE_STATIC_KEY_FALSE(delayacct_key); int delayacct_on __read_mostly; /* Delay accounting turned on/off */ struct kmem_cache *delayacct_cache; @@ -173,41 +182,13 @@ int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) /* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */ raw_spin_lock_irqsave(&tsk->delays->lock, flags); - d->blkio_delay_max = tsk->delays->blkio_delay_max; - d->blkio_delay_min = tsk->delays->blkio_delay_min; - tmp = d->blkio_delay_total + tsk->delays->blkio_delay; - d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp; - d->swapin_delay_max = tsk->delays->swapin_delay_max; - d->swapin_delay_min = tsk->delays->swapin_delay_min; - tmp = d->swapin_delay_total + tsk->delays->swapin_delay; - d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp; - d->freepages_delay_max = tsk->delays->freepages_delay_max; - d->freepages_delay_min = tsk->delays->freepages_delay_min; - tmp = d->freepages_delay_total + tsk->delays->freepages_delay; - d->freepages_delay_total = (tmp < d->freepages_delay_total) ? 0 : tmp; - d->thrashing_delay_max = tsk->delays->thrashing_delay_max; - d->thrashing_delay_min = tsk->delays->thrashing_delay_min; - tmp = d->thrashing_delay_total + tsk->delays->thrashing_delay; - d->thrashing_delay_total = (tmp < d->thrashing_delay_total) ? 0 : tmp; - d->compact_delay_max = tsk->delays->compact_delay_max; - d->compact_delay_min = tsk->delays->compact_delay_min; - tmp = d->compact_delay_total + tsk->delays->compact_delay; - d->compact_delay_total = (tmp < d->compact_delay_total) ? 0 : tmp; - d->wpcopy_delay_max = tsk->delays->wpcopy_delay_max; - d->wpcopy_delay_min = tsk->delays->wpcopy_delay_min; - tmp = d->wpcopy_delay_total + tsk->delays->wpcopy_delay; - d->wpcopy_delay_total = (tmp < d->wpcopy_delay_total) ? 0 : tmp; - d->irq_delay_max = tsk->delays->irq_delay_max; - d->irq_delay_min = tsk->delays->irq_delay_min; - tmp = d->irq_delay_total + tsk->delays->irq_delay; - d->irq_delay_total = (tmp < d->irq_delay_total) ? 0 : tmp; - d->blkio_count += tsk->delays->blkio_count; - d->swapin_count += tsk->delays->swapin_count; - d->freepages_count += tsk->delays->freepages_count; - d->thrashing_count += tsk->delays->thrashing_count; - d->compact_count += tsk->delays->compact_count; - d->wpcopy_count += tsk->delays->wpcopy_count; - d->irq_count += tsk->delays->irq_count; + UPDATE_DELAY(blkio); + UPDATE_DELAY(swapin); + UPDATE_DELAY(freepages); + UPDATE_DELAY(thrashing); + UPDATE_DELAY(compact); + UPDATE_DELAY(wpcopy); + UPDATE_DELAY(irq); raw_spin_unlock_irqrestore(&tsk->delays->lock, flags); return 0; -- cgit v1.2.3-59-g8ed1b From 375700bab5b150e876e42d894a9a7470881f8a61 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 23 May 2025 13:13:11 -0600 Subject: llist: make llist_add_batch() a static inline The function is small enough that it should be, and it's a (very) hot path for io_uring. Doing this actually reduces my vmlinux text size for my standard build/test box. Before: axboe@r7625 ~/g/linux (test)> size vmlinux text data bss dec hex filename 19892174 5938310 2470432 28300916 1afd674 vmlinux After: axboe@r7625 ~/g/linux (test)> size vmlinux text data bss dec hex filename 19891878 5938310 2470436 28300624 1afd550 vmlinux Link: https://lkml.kernel.org/r/f1d104c6-7ac8-457a-a53d-6bb741421b2f@kernel.dk Signed-off-by: Jens Axboe Signed-off-by: Andrew Morton --- include/linux/llist.h | 23 ++++++++++++++++++++--- lib/llist.c | 22 ---------------------- 2 files changed, 20 insertions(+), 25 deletions(-) diff --git a/include/linux/llist.h b/include/linux/llist.h index 2c982ff7475a..27b17f64bcee 100644 --- a/include/linux/llist.h +++ b/include/linux/llist.h @@ -223,9 +223,26 @@ static inline struct llist_node *llist_next(struct llist_node *node) return node->next; } -extern bool llist_add_batch(struct llist_node *new_first, - struct llist_node *new_last, - struct llist_head *head); +/** + * llist_add_batch - add several linked entries in batch + * @new_first: first entry in batch to be added + * @new_last: last entry in batch to be added + * @head: the head for your lock-less list + * + * Return whether list is empty before adding. + */ +static inline bool llist_add_batch(struct llist_node *new_first, + struct llist_node *new_last, + struct llist_head *head) +{ + struct llist_node *first = READ_ONCE(head->first); + + do { + new_last->next = first; + } while (!try_cmpxchg(&head->first, &first, new_first)); + + return !first; +} static inline bool __llist_add_batch(struct llist_node *new_first, struct llist_node *new_last, diff --git a/lib/llist.c b/lib/llist.c index f21d0cfbbaaa..f574c17a238e 100644 --- a/lib/llist.c +++ b/lib/llist.c @@ -14,28 +14,6 @@ #include #include - -/** - * llist_add_batch - add several linked entries in batch - * @new_first: first entry in batch to be added - * @new_last: last entry in batch to be added - * @head: the head for your lock-less list - * - * Return whether list is empty before adding. - */ -bool llist_add_batch(struct llist_node *new_first, struct llist_node *new_last, - struct llist_head *head) -{ - struct llist_node *first = READ_ONCE(head->first); - - do { - new_last->next = first; - } while (!try_cmpxchg(&head->first, &first, new_first)); - - return !first; -} -EXPORT_SYMBOL_GPL(llist_add_batch); - /** * llist_del_first - delete the first entry of lock-less list * @head: the head for your lock-less list -- cgit v1.2.3-59-g8ed1b