From 57116ce17b04fde2fe30f0859df69d8dbe5809f6 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 6 Oct 2021 13:58:52 +0200 Subject: workqueue: fix state-dump console deadlock Console drivers often queue work while holding locks also taken in their console write paths, something which can lead to deadlocks on SMP when dumping workqueue state (e.g. sysrq-t or on suspend failures). For serial console drivers this could look like: CPU0 CPU1 ---- ---- show_workqueue_state(); lock(&pool->lock); lock(&port->lock); schedule_work(); lock(&pool->lock); printk(); lock(console_owner); lock(&port->lock); where workqueues are, for example, used to push data to the line discipline, process break signals and handle modem-status changes. Line disciplines and serdev drivers can also queue work on write-wakeup notifications, etc. Reworking every console driver to avoid queuing work while holding locks also taken in their write paths would complicate drivers and is neither desirable or feasible. Instead use the deferred-printk mechanism to avoid printing while holding pool locks when dumping workqueue state. Note that there are a few WARN_ON() assertions in the workqueue code which could potentially also trigger a deadlock. Hopefully the ongoing printk rework will provide a general solution for this eventually. This was originally reported after a lockdep splat when executing sysrq-t with the imx serial driver. Fixes: 3494fc30846d ("workqueue: dump workqueues on sysrq-t") Cc: stable@vger.kernel.org # 4.0 Reported-by: Fabio Estevam Tested-by: Fabio Estevam Signed-off-by: Johan Hovold Reviewed-by: John Ogness Signed-off-by: Tejun Heo --- kernel/workqueue.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 33a6b4a2443d..1b3eb1e9531f 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4830,8 +4830,16 @@ void show_workqueue_state(void) for_each_pwq(pwq, wq) { raw_spin_lock_irqsave(&pwq->pool->lock, flags); - if (pwq->nr_active || !list_empty(&pwq->inactive_works)) + if (pwq->nr_active || !list_empty(&pwq->inactive_works)) { + /* + * Defer printing to avoid deadlocks in console + * drivers that queue work while holding locks + * also taken in their write paths. + */ + printk_deferred_enter(); show_pwq(pwq); + printk_deferred_exit(); + } raw_spin_unlock_irqrestore(&pwq->pool->lock, flags); /* * We could be printing a lot from atomic context, e.g. @@ -4849,7 +4857,12 @@ void show_workqueue_state(void) raw_spin_lock_irqsave(&pool->lock, flags); if (pool->nr_workers == pool->nr_idle) goto next_pool; - + /* + * Defer printing to avoid deadlocks in console drivers that + * queue work while holding locks also taken in their write + * paths. + */ + printk_deferred_enter(); pr_info("pool %d:", pool->id); pr_cont_pool_info(pool); pr_cont(" hung=%us workers=%d", @@ -4864,6 +4877,7 @@ void show_workqueue_state(void) first = false; } pr_cont("\n"); + printk_deferred_exit(); next_pool: raw_spin_unlock_irqrestore(&pool->lock, flags); /* -- cgit v1.3-14-g43fede From d25302e46592c97d29f70ccb1be558df31a9a360 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sun, 17 Oct 2021 20:04:02 +0800 Subject: workqueue: make sysfs of unbound kworker cpumask more clever Some unfriendly component, such as dpdk, write the same mask to unbound kworker cpumask again and again. Every time it write to this interface some work is queue to cpu, even though the mask is same with the original mask. So, fix it by return success and do nothing if the cpumask is equal with the old one. Signed-off-by: Mengen Sun Signed-off-by: Menglong Dong Signed-off-by: Tejun Heo --- kernel/workqueue.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 1b3eb1e9531f..76988f39ed5a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -5384,9 +5384,6 @@ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask) int ret = -EINVAL; cpumask_var_t saved_cpumask; - if (!zalloc_cpumask_var(&saved_cpumask, GFP_KERNEL)) - return -ENOMEM; - /* * Not excluding isolated cpus on purpose. * If the user wishes to include them, we allow that. @@ -5394,6 +5391,15 @@ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask) cpumask_and(cpumask, cpumask, cpu_possible_mask); if (!cpumask_empty(cpumask)) { apply_wqattrs_lock(); + if (cpumask_equal(cpumask, wq_unbound_cpumask)) { + ret = 0; + goto out_unlock; + } + + if (!zalloc_cpumask_var(&saved_cpumask, GFP_KERNEL)) { + ret = -ENOMEM; + goto out_unlock; + } /* save the old wq_unbound_cpumask. */ cpumask_copy(saved_cpumask, wq_unbound_cpumask); @@ -5406,10 +5412,11 @@ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask) if (ret < 0) cpumask_copy(wq_unbound_cpumask, saved_cpumask); + free_cpumask_var(saved_cpumask); +out_unlock: apply_wqattrs_unlock(); } - free_cpumask_var(saved_cpumask); return ret; } -- cgit v1.3-14-g43fede From 55df0933be74bd2e52aba0b67eb743ae0feabe7e Mon Sep 17 00:00:00 2001 From: Imran Khan Date: Wed, 20 Oct 2021 14:09:00 +1100 Subject: workqueue: Introduce show_one_worker_pool and show_one_workqueue. Currently show_workqueue_state shows the state of all workqueues and of all worker pools. In certain cases we may need to dump state of only a specific workqueue or worker pool. For example in destroy_workqueue we only need to show state of the workqueue which is getting destroyed. So rename show_workqueue_state to show_all_workqueues(to signify it dumps state of all busy workqueues) and divide it into more granular functions (show_one_workqueue and show_one_worker_pool), that would show states of individual workqueues and worker pools and can be used in cases such as the one mentioned above. Also, as mentioned earlier, make destroy_workqueue dump data pertaining to only the workqueue that is being destroyed and make user(s) of earlier interface(show_workqueue_state), use new interface (show_all_workqueues). Signed-off-by: Imran Khan Signed-off-by: Tejun Heo --- drivers/tty/sysrq.c | 2 +- include/linux/workqueue.h | 3 +- kernel/power/process.c | 2 +- kernel/workqueue.c | 172 ++++++++++++++++++++++++++-------------------- 4 files changed, 100 insertions(+), 79 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index c911196ac893..8d0f07509ca7 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -296,7 +296,7 @@ static const struct sysrq_key_op sysrq_showregs_op = { static void sysrq_handle_showstate(int key) { show_state(); - show_workqueue_state(); + show_all_workqueues(); } static const struct sysrq_key_op sysrq_showstate_op = { .handler = sysrq_handle_showstate, diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 74d3c1efd9bb..7fee9b6cfede 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -469,7 +469,8 @@ extern bool workqueue_congested(int cpu, struct workqueue_struct *wq); extern unsigned int work_busy(struct work_struct *work); extern __printf(1, 2) void set_worker_desc(const char *fmt, ...); extern void print_worker_info(const char *log_lvl, struct task_struct *task); -extern void show_workqueue_state(void); +extern void show_all_workqueues(void); +extern void show_one_workqueue(struct workqueue_struct *wq); extern void wq_worker_comm(char *buf, size_t size, struct task_struct *task); /** diff --git a/kernel/power/process.c b/kernel/power/process.c index 37401c99b7d7..b7e7798637b8 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -94,7 +94,7 @@ static int try_to_freeze_tasks(bool user_only) todo - wq_busy, wq_busy); if (wq_busy) - show_workqueue_state(); + show_all_workqueues(); if (!wakeup || pm_debug_messages_on) { read_lock(&tasklist_lock); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 76988f39ed5a..1a7df882f55e 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -375,6 +375,7 @@ EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq); static int worker_thread(void *__worker); static void workqueue_sysfs_unregister(struct workqueue_struct *wq); static void show_pwq(struct pool_workqueue *pwq); +static void show_one_worker_pool(struct worker_pool *pool); #define CREATE_TRACE_POINTS #include @@ -4447,7 +4448,7 @@ void destroy_workqueue(struct workqueue_struct *wq) raw_spin_unlock_irq(&pwq->pool->lock); mutex_unlock(&wq->mutex); mutex_unlock(&wq_pool_mutex); - show_workqueue_state(); + show_one_workqueue(wq); return; } raw_spin_unlock_irq(&pwq->pool->lock); @@ -4797,97 +4798,116 @@ static void show_pwq(struct pool_workqueue *pwq) } /** - * show_workqueue_state - dump workqueue state - * - * Called from a sysrq handler or try_to_freeze_tasks() and prints out - * all busy workqueues and pools. + * show_one_workqueue - dump state of specified workqueue + * @wq: workqueue whose state will be printed */ -void show_workqueue_state(void) +void show_one_workqueue(struct workqueue_struct *wq) { - struct workqueue_struct *wq; - struct worker_pool *pool; + struct pool_workqueue *pwq; + bool idle = true; unsigned long flags; - int pi; - - rcu_read_lock(); - pr_info("Showing busy workqueues and worker pools:\n"); - - list_for_each_entry_rcu(wq, &workqueues, list) { - struct pool_workqueue *pwq; - bool idle = true; - - for_each_pwq(pwq, wq) { - if (pwq->nr_active || !list_empty(&pwq->inactive_works)) { - idle = false; - break; - } + for_each_pwq(pwq, wq) { + if (pwq->nr_active || !list_empty(&pwq->inactive_works)) { + idle = false; + break; } - if (idle) - continue; + } + if (idle) /* Nothing to print for idle workqueue */ + return; - pr_info("workqueue %s: flags=0x%x\n", wq->name, wq->flags); + pr_info("workqueue %s: flags=0x%x\n", wq->name, wq->flags); - for_each_pwq(pwq, wq) { - raw_spin_lock_irqsave(&pwq->pool->lock, flags); - if (pwq->nr_active || !list_empty(&pwq->inactive_works)) { - /* - * Defer printing to avoid deadlocks in console - * drivers that queue work while holding locks - * also taken in their write paths. - */ - printk_deferred_enter(); - show_pwq(pwq); - printk_deferred_exit(); - } - raw_spin_unlock_irqrestore(&pwq->pool->lock, flags); + for_each_pwq(pwq, wq) { + raw_spin_lock_irqsave(&pwq->pool->lock, flags); + if (pwq->nr_active || !list_empty(&pwq->inactive_works)) { /* - * We could be printing a lot from atomic context, e.g. - * sysrq-t -> show_workqueue_state(). Avoid triggering - * hard lockup. + * Defer printing to avoid deadlocks in console + * drivers that queue work while holding locks + * also taken in their write paths. */ - touch_nmi_watchdog(); - } - } - - for_each_pool(pool, pi) { - struct worker *worker; - bool first = true; - - raw_spin_lock_irqsave(&pool->lock, flags); - if (pool->nr_workers == pool->nr_idle) - goto next_pool; - /* - * Defer printing to avoid deadlocks in console drivers that - * queue work while holding locks also taken in their write - * paths. - */ - printk_deferred_enter(); - pr_info("pool %d:", pool->id); - pr_cont_pool_info(pool); - pr_cont(" hung=%us workers=%d", - jiffies_to_msecs(jiffies - pool->watchdog_ts) / 1000, - pool->nr_workers); - if (pool->manager) - pr_cont(" manager: %d", - task_pid_nr(pool->manager->task)); - list_for_each_entry(worker, &pool->idle_list, entry) { - pr_cont(" %s%d", first ? "idle: " : "", - task_pid_nr(worker->task)); - first = false; + printk_deferred_enter(); + show_pwq(pwq); + printk_deferred_exit(); } - pr_cont("\n"); - printk_deferred_exit(); - next_pool: - raw_spin_unlock_irqrestore(&pool->lock, flags); + raw_spin_unlock_irqrestore(&pwq->pool->lock, flags); /* * We could be printing a lot from atomic context, e.g. - * sysrq-t -> show_workqueue_state(). Avoid triggering + * sysrq-t -> show_all_workqueues(). Avoid triggering * hard lockup. */ touch_nmi_watchdog(); } +} + +/** + * show_one_worker_pool - dump state of specified worker pool + * @pool: worker pool whose state will be printed + */ +static void show_one_worker_pool(struct worker_pool *pool) +{ + struct worker *worker; + bool first = true; + unsigned long flags; + + raw_spin_lock_irqsave(&pool->lock, flags); + if (pool->nr_workers == pool->nr_idle) + goto next_pool; + /* + * Defer printing to avoid deadlocks in console drivers that + * queue work while holding locks also taken in their write + * paths. + */ + printk_deferred_enter(); + pr_info("pool %d:", pool->id); + pr_cont_pool_info(pool); + pr_cont(" hung=%us workers=%d", + jiffies_to_msecs(jiffies - pool->watchdog_ts) / 1000, + pool->nr_workers); + if (pool->manager) + pr_cont(" manager: %d", + task_pid_nr(pool->manager->task)); + list_for_each_entry(worker, &pool->idle_list, entry) { + pr_cont(" %s%d", first ? "idle: " : "", + task_pid_nr(worker->task)); + first = false; + } + pr_cont("\n"); + printk_deferred_exit(); +next_pool: + raw_spin_unlock_irqrestore(&pool->lock, flags); + /* + * We could be printing a lot from atomic context, e.g. + * sysrq-t -> show_all_workqueues(). Avoid triggering + * hard lockup. + */ + touch_nmi_watchdog(); + +} + +/** + * show_all_workqueues - dump workqueue state + * + * Called from a sysrq handler or try_to_freeze_tasks() and prints out + * all busy workqueues and pools. + */ +void show_all_workqueues(void) +{ + struct workqueue_struct *wq; + struct worker_pool *pool; + int pi; + + rcu_read_lock(); + + pr_info("Showing busy workqueues and worker pools:\n"); + + list_for_each_entry_rcu(wq, &workqueues, list) + show_one_workqueue(wq); + + for_each_pool(pool, pi) + show_one_worker_pool(pool); + rcu_read_unlock(); } @@ -5876,7 +5896,7 @@ static void wq_watchdog_timer_fn(struct timer_list *unused) rcu_read_unlock(); if (lockup_detected) - show_workqueue_state(); + show_all_workqueues(); wq_watchdog_reset_touched(); mod_timer(&wq_watchdog_timer, jiffies + thresh); -- cgit v1.3-14-g43fede From f70da745be4d4c367568b8345f50db1ae04efcb2 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 5 Nov 2021 13:35:50 -0700 Subject: workqueue, kasan: avoid alloc_pages() when recording stack Shuah Khan reported: | When CONFIG_PROVE_RAW_LOCK_NESTING=y and CONFIG_KASAN are enabled, | kasan_record_aux_stack() runs into "BUG: Invalid wait context" when | it tries to allocate memory attempting to acquire spinlock in page | allocation code while holding workqueue pool raw_spinlock. | | There are several instances of this problem when block layer tries | to __queue_work(). Call trace from one of these instances is below: | | kblockd_mod_delayed_work_on() | mod_delayed_work_on() | __queue_delayed_work() | __queue_work() (rcu_read_lock, raw_spin_lock pool->lock held) | insert_work() | kasan_record_aux_stack() | kasan_save_stack() | stack_depot_save() | alloc_pages() | __alloc_pages() | get_page_from_freelist() | rm_queue() | rm_queue_pcplist() | local_lock_irqsave(&pagesets.lock, flags); | [ BUG: Invalid wait context triggered ] The default kasan_record_aux_stack() calls stack_depot_save() with GFP_NOWAIT, which in turn can then call alloc_pages(GFP_NOWAIT, ...). In general, however, it is not even possible to use either GFP_ATOMIC nor GFP_NOWAIT in certain non-preemptive contexts, including raw_spin_locks (see gfp.h and commmit ab00db216c9c7). Fix it by instructing stackdepot to not expand stack storage via alloc_pages() in case it runs out by using kasan_record_aux_stack_noalloc(). While there is an increased risk of failing to insert the stack trace, this is typically unlikely, especially if the same insertion had already succeeded previously (stack depot hit). For frequent calls from the same location, it therefore becomes extremely unlikely that kasan_record_aux_stack_noalloc() fails. Link: https://lkml.kernel.org/r/20210902200134.25603-1-skhan@linuxfoundation.org Link: https://lkml.kernel.org/r/20210913112609.2651084-7-elver@google.com Signed-off-by: Marco Elver Reported-by: Shuah Khan Tested-by: Shuah Khan Acked-by: Sebastian Andrzej Siewior Acked-by: Tejun Heo Reviewed-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: "Gustavo A. R. Silva" Cc: Lai Jiangshan Cc: Taras Madan Cc: Thomas Gleixner Cc: Vijayanand Jitta Cc: Vinayak Menon Cc: Walter Wu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 1b3eb1e9531f..12e0d9cb4ac7 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1350,7 +1350,7 @@ static void insert_work(struct pool_workqueue *pwq, struct work_struct *work, struct worker_pool *pool = pwq->pool; /* record the work call stack in order to print it in KASAN reports */ - kasan_record_aux_stack(work); + kasan_record_aux_stack_noalloc(work); /* we own @work, set data and link */ set_work_pwq(work, pwq, extra_flags); -- cgit v1.3-14-g43fede