From 4d43d395fed124631ca02356c711facb90185175 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Date: Wed, 23 Jan 2019 09:44:12 +0900
Subject: workqueue: Try to catch flush_work() without INIT_WORK().

syzbot found a flush_work() caller who forgot to call INIT_WORK()
because that work_struct was allocated by kzalloc() [1]. But the message

  INFO: trying to register non-static key.
  the code is fine but needs lockdep annotation.
  turning off the locking correctness validator.

by lock_map_acquire() is failing to tell that INIT_WORK() is missing.

Since flush_work() without INIT_WORK() is a bug, and INIT_WORK() should
set ->func field to non-zero, let's warn if ->func field is zero.

[1] https://syzkaller.appspot.com/bug?id=a5954455fcfa51c29ca2ab55b203076337e1c770

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel/workqueue.c')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 392be4b252f6..a503ad9d0aec 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2908,6 +2908,9 @@ static bool __flush_work(struct work_struct *work, bool from_cancel)
 	if (WARN_ON(!wq_online))
 		return false;
 
+	if (WARN_ON(!work->func))
+		return false;
+
 	if (!from_cancel) {
 		lock_map_acquire(&work->lockdep_map);
 		lock_map_release(&work->lockdep_map);
-- 
cgit v1.3-7-g2ca7


From 8204e0c1113d6b7f599bcd7ebfbfde72e76c102f Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@linux.intel.com>
Date: Tue, 22 Jan 2019 10:39:26 -0800
Subject: workqueue: Provide queue_work_node to queue work near a given NUMA
 node

Provide a new function, queue_work_node, which is meant to schedule work on
a "random" CPU of the requested NUMA node. The main motivation for this is
to help assist asynchronous init to better improve boot times for devices
that are local to a specific node.

For now we just default to the first CPU that is in the intersection of the
cpumask of the node and the online cpumask. The only exception is if the
CPU is local to the node we will just use the current CPU. This should work
for our purposes as we are currently only using this for unbound work so
the CPU will be translated to a node anyway instead of being directly used.

As we are only using the first CPU to represent the NUMA node for now I am
limiting the scope of the function so that it can only be used with unbound
workqueues.

Acked-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Acked-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/workqueue.h |  2 ++
 kernel/workqueue.c        | 84 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 86 insertions(+)

(limited to 'kernel/workqueue.c')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 60d673e15632..1f50c1e586e7 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -463,6 +463,8 @@ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask);
 
 extern bool queue_work_on(int cpu, struct workqueue_struct *wq,
 			struct work_struct *work);
+extern bool queue_work_node(int node, struct workqueue_struct *wq,
+			    struct work_struct *work);
 extern bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
 			struct delayed_work *work, unsigned long delay);
 extern bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 392be4b252f6..d5a26e456f7a 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1492,6 +1492,90 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq,
 }
 EXPORT_SYMBOL(queue_work_on);
 
+/**
+ * workqueue_select_cpu_near - Select a CPU based on NUMA node
+ * @node: NUMA node ID that we want to select a CPU from
+ *
+ * This function will attempt to find a "random" cpu available on a given
+ * node. If there are no CPUs available on the given node it will return
+ * WORK_CPU_UNBOUND indicating that we should just schedule to any
+ * available CPU if we need to schedule this work.
+ */
+static int workqueue_select_cpu_near(int node)
+{
+	int cpu;
+
+	/* No point in doing this if NUMA isn't enabled for workqueues */
+	if (!wq_numa_enabled)
+		return WORK_CPU_UNBOUND;
+
+	/* Delay binding to CPU if node is not valid or online */
+	if (node < 0 || node >= MAX_NUMNODES || !node_online(node))
+		return WORK_CPU_UNBOUND;
+
+	/* Use local node/cpu if we are already there */
+	cpu = raw_smp_processor_id();
+	if (node == cpu_to_node(cpu))
+		return cpu;
+
+	/* Use "random" otherwise know as "first" online CPU of node */
+	cpu = cpumask_any_and(cpumask_of_node(node), cpu_online_mask);
+
+	/* If CPU is valid return that, otherwise just defer */
+	return cpu < nr_cpu_ids ? cpu : WORK_CPU_UNBOUND;
+}
+
+/**
+ * queue_work_node - queue work on a "random" cpu for a given NUMA node
+ * @node: NUMA node that we are targeting the work for
+ * @wq: workqueue to use
+ * @work: work to queue
+ *
+ * We queue the work to a "random" CPU within a given NUMA node. The basic
+ * idea here is to provide a way to somehow associate work with a given
+ * NUMA node.
+ *
+ * This function will only make a best effort attempt at getting this onto
+ * the right NUMA node. If no node is requested or the requested node is
+ * offline then we just fall back to standard queue_work behavior.
+ *
+ * Currently the "random" CPU ends up being the first available CPU in the
+ * intersection of cpu_online_mask and the cpumask of the node, unless we
+ * are running on the node. In that case we just use the current CPU.
+ *
+ * Return: %false if @work was already on a queue, %true otherwise.
+ */
+bool queue_work_node(int node, struct workqueue_struct *wq,
+		     struct work_struct *work)
+{
+	unsigned long flags;
+	bool ret = false;
+
+	/*
+	 * This current implementation is specific to unbound workqueues.
+	 * Specifically we only return the first available CPU for a given
+	 * node instead of cycling through individual CPUs within the node.
+	 *
+	 * If this is used with a per-cpu workqueue then the logic in
+	 * workqueue_select_cpu_near would need to be updated to allow for
+	 * some round robin type logic.
+	 */
+	WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND));
+
+	local_irq_save(flags);
+
+	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
+		int cpu = workqueue_select_cpu_near(node);
+
+		__queue_work(cpu, wq, work);
+		ret = true;
+	}
+
+	local_irq_restore(flags);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(queue_work_node);
+
 void delayed_work_timer_fn(struct timer_list *t)
 {
 	struct delayed_work *dwork = from_timer(dwork, t, timer);
-- 
cgit v1.3-7-g2ca7


From 8bdc6201785d0b5231bfd77bf2b62726395092d7 Mon Sep 17 00:00:00 2001
From: Liu Song <fishland@aliyun.com>
Date: Tue, 19 Feb 2019 23:53:27 +0800
Subject: workqueue: fix typo in comment

qeueue/queue

Signed-off-by: Liu Song <liu.song11@zte.com.cn>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/workqueue.c')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index a503ad9d0aec..29a4de4025be 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -646,7 +646,7 @@ static void set_work_pool_and_clear_pending(struct work_struct *work,
 	 * The following mb guarantees that previous clear of a PENDING bit
 	 * will not be reordered with any speculative LOADS or STORES from
 	 * work->current_func, which is executed afterwards.  This possible
-	 * reordering can lead to a missed execution on attempt to qeueue
+	 * reordering can lead to a missed execution on attempt to queue
 	 * the same @work.  E.g. consider this case:
 	 *
 	 *   CPU#0                         CPU#1
-- 
cgit v1.3-7-g2ca7


From 669de8bda87b92ab9a2fc663b3f5743c2ad1ae9f Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Thu, 14 Feb 2019 15:00:54 -0800
Subject: kernel/workqueue: Use dynamic lockdep keys for workqueues

The following commit:

  87915adc3f0a ("workqueue: re-add lockdep dependencies for flushing")

improved deadlock checking in the workqueue implementation. Unfortunately
that patch also introduced a few false positive lockdep complaints.

This patch suppresses these false positives by allocating the workqueue mutex
lockdep key dynamically.

An example of a false positive lockdep complaint suppressed by this patch
can be found below. The root cause of the lockdep complaint shown below
is that the direct I/O code can call alloc_workqueue() from inside a work
item created by another alloc_workqueue() call and that both workqueues
share the same lockdep key. This patch avoids that that lockdep complaint
is triggered by allocating the work queue lockdep keys dynamically.

In other words, this patch guarantees that a unique lockdep key is
associated with each work queue mutex.

  ======================================================
  WARNING: possible circular locking dependency detected
  4.19.0-dbg+ #1 Not tainted
  fio/4129 is trying to acquire lock:
  00000000a01cfe1a ((wq_completion)"dio/%s"sb->s_id){+.+.}, at: flush_workqueue+0xd0/0x970

  but task is already holding lock:
  00000000a0acecf9 (&sb->s_type->i_mutex_key#14){+.+.}, at: ext4_file_write_iter+0x154/0x710

  which lock already depends on the new lock.

  the existing dependency chain (in reverse order) is:

  -> #2 (&sb->s_type->i_mutex_key#14){+.+.}:
         down_write+0x3d/0x80
         __generic_file_fsync+0x77/0xf0
         ext4_sync_file+0x3c9/0x780
         vfs_fsync_range+0x66/0x100
         dio_complete+0x2f5/0x360
         dio_aio_complete_work+0x1c/0x20
         process_one_work+0x481/0x9f0
         worker_thread+0x63/0x5a0
         kthread+0x1cf/0x1f0
         ret_from_fork+0x24/0x30

  -> #1 ((work_completion)(&dio->complete_work)){+.+.}:
         process_one_work+0x447/0x9f0
         worker_thread+0x63/0x5a0
         kthread+0x1cf/0x1f0
         ret_from_fork+0x24/0x30

  -> #0 ((wq_completion)"dio/%s"sb->s_id){+.+.}:
         lock_acquire+0xc5/0x200
         flush_workqueue+0xf3/0x970
         drain_workqueue+0xec/0x220
         destroy_workqueue+0x23/0x350
         sb_init_dio_done_wq+0x6a/0x80
         do_blockdev_direct_IO+0x1f33/0x4be0
         __blockdev_direct_IO+0x79/0x86
         ext4_direct_IO+0x5df/0xbb0
         generic_file_direct_write+0x119/0x220
         __generic_file_write_iter+0x131/0x2d0
         ext4_file_write_iter+0x3fa/0x710
         aio_write+0x235/0x330
         io_submit_one+0x510/0xeb0
         __x64_sys_io_submit+0x122/0x340
         do_syscall_64+0x71/0x220
         entry_SYSCALL_64_after_hwframe+0x49/0xbe

  other info that might help us debug this:

  Chain exists of:
    (wq_completion)"dio/%s"sb->s_id --> (work_completion)(&dio->complete_work) --> &sb->s_type->i_mutex_key#14

   Possible unsafe locking scenario:

         CPU0                    CPU1
         ----                    ----
    lock(&sb->s_type->i_mutex_key#14);
                                 lock((work_completion)(&dio->complete_work));
                                 lock(&sb->s_type->i_mutex_key#14);
    lock((wq_completion)"dio/%s"sb->s_id);

   *** DEADLOCK ***

  1 lock held by fio/4129:
   #0: 00000000a0acecf9 (&sb->s_type->i_mutex_key#14){+.+.}, at: ext4_file_write_iter+0x154/0x710

  stack backtrace:
  CPU: 3 PID: 4129 Comm: fio Not tainted 4.19.0-dbg+ #1
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014
  Call Trace:
   dump_stack+0x86/0xc5
   print_circular_bug.isra.32+0x20a/0x218
   __lock_acquire+0x1c68/0x1cf0
   lock_acquire+0xc5/0x200
   flush_workqueue+0xf3/0x970
   drain_workqueue+0xec/0x220
   destroy_workqueue+0x23/0x350
   sb_init_dio_done_wq+0x6a/0x80
   do_blockdev_direct_IO+0x1f33/0x4be0
   __blockdev_direct_IO+0x79/0x86
   ext4_direct_IO+0x5df/0xbb0
   generic_file_direct_write+0x119/0x220
   __generic_file_write_iter+0x131/0x2d0
   ext4_file_write_iter+0x3fa/0x710
   aio_write+0x235/0x330
   io_submit_one+0x510/0xeb0
   __x64_sys_io_submit+0x122/0x340
   do_syscall_64+0x71/0x220
   entry_SYSCALL_64_after_hwframe+0x49/0xbe

Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Johannes Berg <johannes.berg@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Waiman Long <longman@redhat.com>
Cc: Will Deacon <will.deacon@arm.com>
Link: https://lkml.kernel.org/r/20190214230058.196511-20-bvanassche@acm.org
[ Reworked the changelog a bit. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/workqueue.h | 28 ++++------------------
 kernel/workqueue.c        | 59 +++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 54 insertions(+), 33 deletions(-)

(limited to 'kernel/workqueue.c')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 60d673e15632..d9a1a480e920 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -390,43 +390,23 @@ extern struct workqueue_struct *system_freezable_wq;
 extern struct workqueue_struct *system_power_efficient_wq;
 extern struct workqueue_struct *system_freezable_power_efficient_wq;
 
-extern struct workqueue_struct *
-__alloc_workqueue_key(const char *fmt, unsigned int flags, int max_active,
-	struct lock_class_key *key, const char *lock_name, ...) __printf(1, 6);
-
 /**
  * alloc_workqueue - allocate a workqueue
  * @fmt: printf format for the name of the workqueue
  * @flags: WQ_* flags
  * @max_active: max in-flight work items, 0 for default
- * @args...: args for @fmt
+ * remaining args: args for @fmt
  *
  * Allocate a workqueue with the specified parameters.  For detailed
  * information on WQ_* flags, please refer to
  * Documentation/core-api/workqueue.rst.
  *
- * The __lock_name macro dance is to guarantee that single lock_class_key
- * doesn't end up with different namesm, which isn't allowed by lockdep.
- *
  * RETURNS:
  * Pointer to the allocated workqueue on success, %NULL on failure.
  */
-#ifdef CONFIG_LOCKDEP
-#define alloc_workqueue(fmt, flags, max_active, args...)		\
-({									\
-	static struct lock_class_key __key;				\
-	const char *__lock_name;					\
-									\
-	__lock_name = "(wq_completion)"#fmt#args;			\
-									\
-	__alloc_workqueue_key((fmt), (flags), (max_active),		\
-			      &__key, __lock_name, ##args);		\
-})
-#else
-#define alloc_workqueue(fmt, flags, max_active, args...)		\
-	__alloc_workqueue_key((fmt), (flags), (max_active),		\
-			      NULL, NULL, ##args)
-#endif
+struct workqueue_struct *alloc_workqueue(const char *fmt,
+					 unsigned int flags,
+					 int max_active, ...);
 
 /**
  * alloc_ordered_workqueue - allocate an ordered workqueue
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index fc5d23d752a5..e163e7a7f5e5 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -259,6 +259,8 @@ struct workqueue_struct {
 	struct wq_device	*wq_dev;	/* I: for sysfs interface */
 #endif
 #ifdef CONFIG_LOCKDEP
+	char			*lock_name;
+	struct lock_class_key	key;
 	struct lockdep_map	lockdep_map;
 #endif
 	char			name[WQ_NAME_LEN]; /* I: workqueue name */
@@ -3337,11 +3339,49 @@ static int init_worker_pool(struct worker_pool *pool)
 	return 0;
 }
 
+#ifdef CONFIG_LOCKDEP
+static void wq_init_lockdep(struct workqueue_struct *wq)
+{
+	char *lock_name;
+
+	lockdep_register_key(&wq->key);
+	lock_name = kasprintf(GFP_KERNEL, "%s%s", "(wq_completion)", wq->name);
+	if (!lock_name)
+		lock_name = wq->name;
+	lockdep_init_map(&wq->lockdep_map, lock_name, &wq->key, 0);
+}
+
+static void wq_unregister_lockdep(struct workqueue_struct *wq)
+{
+	lockdep_unregister_key(&wq->key);
+}
+
+static void wq_free_lockdep(struct workqueue_struct *wq)
+{
+	if (wq->lock_name != wq->name)
+		kfree(wq->lock_name);
+}
+#else
+static void wq_init_lockdep(struct workqueue_struct *wq)
+{
+}
+
+static void wq_unregister_lockdep(struct workqueue_struct *wq)
+{
+}
+
+static void wq_free_lockdep(struct workqueue_struct *wq)
+{
+}
+#endif
+
 static void rcu_free_wq(struct rcu_head *rcu)
 {
 	struct workqueue_struct *wq =
 		container_of(rcu, struct workqueue_struct, rcu);
 
+	wq_free_lockdep(wq);
+
 	if (!(wq->flags & WQ_UNBOUND))
 		free_percpu(wq->cpu_pwqs);
 	else
@@ -3532,8 +3572,10 @@ static void pwq_unbound_release_workfn(struct work_struct *work)
 	 * If we're the last pwq going away, @wq is already dead and no one
 	 * is gonna access it anymore.  Schedule RCU free.
 	 */
-	if (is_last)
+	if (is_last) {
+		wq_unregister_lockdep(wq);
 		call_rcu(&wq->rcu, rcu_free_wq);
+	}
 }
 
 /**
@@ -4067,11 +4109,9 @@ static int init_rescuer(struct workqueue_struct *wq)
 	return 0;
 }
 
-struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
-					       unsigned int flags,
-					       int max_active,
-					       struct lock_class_key *key,
-					       const char *lock_name, ...)
+struct workqueue_struct *alloc_workqueue(const char *fmt,
+					 unsigned int flags,
+					 int max_active, ...)
 {
 	size_t tbl_size = 0;
 	va_list args;
@@ -4106,7 +4146,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
 			goto err_free_wq;
 	}
 
-	va_start(args, lock_name);
+	va_start(args, max_active);
 	vsnprintf(wq->name, sizeof(wq->name), fmt, args);
 	va_end(args);
 
@@ -4123,7 +4163,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
 	INIT_LIST_HEAD(&wq->flusher_overflow);
 	INIT_LIST_HEAD(&wq->maydays);
 
-	lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
+	wq_init_lockdep(wq);
 	INIT_LIST_HEAD(&wq->list);
 
 	if (alloc_and_link_pwqs(wq) < 0)
@@ -4161,7 +4201,7 @@ err_destroy:
 	destroy_workqueue(wq);
 	return NULL;
 }
-EXPORT_SYMBOL_GPL(__alloc_workqueue_key);
+EXPORT_SYMBOL_GPL(alloc_workqueue);
 
 /**
  * destroy_workqueue - safely terminate a workqueue
@@ -4214,6 +4254,7 @@ void destroy_workqueue(struct workqueue_struct *wq)
 		kthread_stop(wq->rescuer->task);
 
 	if (!(wq->flags & WQ_UNBOUND)) {
+		wq_unregister_lockdep(wq);
 		/*
 		 * The base ref is never dropped on per-cpu pwqs.  Directly
 		 * schedule RCU free.
-- 
cgit v1.3-7-g2ca7


From bf393fd4a3c888e6d407968f461900481bd0c041 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Fri, 1 Mar 2019 13:57:25 -0800
Subject: workqueue: Fix spelling in source code comments

Change "execuing" into "executing" and "guarnateed" into "guaranteed".

Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel/workqueue.c')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 29a4de4025be..1ef68ac89162 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1321,7 +1321,7 @@ static bool is_chained_work(struct workqueue_struct *wq)
 
 	worker = current_wq_worker();
 	/*
-	 * Return %true iff I'm a worker execuing a work item on @wq.  If
+	 * Return %true iff I'm a worker executing a work item on @wq.  If
 	 * I'm @worker, it's safe to dereference it without locking.
 	 */
 	return worker && worker->current_pwq->wq == wq;
@@ -1619,7 +1619,7 @@ static void rcu_work_rcufn(struct rcu_head *rcu)
  *
  * Return: %false if @rwork was already pending, %true otherwise.  Note
  * that a full RCU grace period is guaranteed only after a %true return.
- * While @rwork is guarnateed to be executed after a %false return, the
+ * While @rwork is guaranteed to be executed after a %false return, the
  * execution may happen before a full RCU grace period has passed.
  */
 bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork)
-- 
cgit v1.3-7-g2ca7


From 4b0470027528ba98f9617f4ceba328de71d2fe49 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Thu, 7 Mar 2019 16:29:30 -0800
Subject: kernel: workqueue: clarify wq_worker_last_func() caller requirements

This function can only be called safely from very specific scheduler
contexts.  Document those.

Link: http://lkml.kernel.org/r/20190206150528.31198-1-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Suggested-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/workqueue.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'kernel/workqueue.c')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 56814902bc56..d51c37dd9422 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -920,6 +920,16 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task)
  * CONTEXT:
  * spin_lock_irq(rq->lock)
  *
+ * This function is called during schedule() when a kworker is going
+ * to sleep. It's used by psi to identify aggregation workers during
+ * dequeuing, to allow periodic aggregation to shut-off when that
+ * worker is the last task in the system or cgroup to go to sleep.
+ *
+ * As this function doesn't involve any workqueue-related locking, it
+ * only returns stable values when called from inside the scheduler's
+ * queuing and dequeuing paths, when @task, which must be a kworker,
+ * is guaranteed to not be processing any works.
+ *
  * Return:
  * The last work function %current executed as a worker, NULL if it
  * hasn't executed any work yet.
-- 
cgit v1.3-7-g2ca7


From 009bb421b6ceb7916ce627023d0eb7ced04c8910 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Sun, 3 Mar 2019 14:00:46 -0800
Subject: workqueue, lockdep: Fix an alloc_workqueue() error path

This patch fixes a use-after-free and a memory leak in an alloc_workqueue()
error path.

Repoted by syzkaller and KASAN:

  BUG: KASAN: use-after-free in __read_once_size include/linux/compiler.h:197 [inline]
  BUG: KASAN: use-after-free in lockdep_register_key+0x3b9/0x490 kernel/locking/lockdep.c:1023
  Read of size 8 at addr ffff888090fc2698 by task syz-executor134/7858

  CPU: 1 PID: 7858 Comm: syz-executor134 Not tainted 5.0.0-rc8-next-20190301 #1
  Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
  Call Trace:
   __dump_stack lib/dump_stack.c:77 [inline]
   dump_stack+0x172/0x1f0 lib/dump_stack.c:113
   print_address_description.cold+0x7c/0x20d mm/kasan/report.c:187
   kasan_report.cold+0x1b/0x40 mm/kasan/report.c:317
   __asan_report_load8_noabort+0x14/0x20 mm/kasan/generic_report.c:132
   __read_once_size include/linux/compiler.h:197 [inline]
   lockdep_register_key+0x3b9/0x490 kernel/locking/lockdep.c:1023
   wq_init_lockdep kernel/workqueue.c:3444 [inline]
   alloc_workqueue+0x427/0xe70 kernel/workqueue.c:4263
   ucma_open+0x76/0x290 drivers/infiniband/core/ucma.c:1732
   misc_open+0x398/0x4c0 drivers/char/misc.c:141
   chrdev_open+0x247/0x6b0 fs/char_dev.c:417
   do_dentry_open+0x488/0x1160 fs/open.c:771
   vfs_open+0xa0/0xd0 fs/open.c:880
   do_last fs/namei.c:3416 [inline]
   path_openat+0x10e9/0x46e0 fs/namei.c:3533
   do_filp_open+0x1a1/0x280 fs/namei.c:3563
   do_sys_open+0x3fe/0x5d0 fs/open.c:1063
   __do_sys_openat fs/open.c:1090 [inline]
   __se_sys_openat fs/open.c:1084 [inline]
   __x64_sys_openat+0x9d/0x100 fs/open.c:1084
   do_syscall_64+0x103/0x610 arch/x86/entry/common.c:290
   entry_SYSCALL_64_after_hwframe+0x49/0xbe

  Allocated by task 7789:
   save_stack+0x45/0xd0 mm/kasan/common.c:75
   set_track mm/kasan/common.c:87 [inline]
   __kasan_kmalloc mm/kasan/common.c:497 [inline]
   __kasan_kmalloc.constprop.0+0xcf/0xe0 mm/kasan/common.c:470
   kasan_kmalloc+0x9/0x10 mm/kasan/common.c:511
   __do_kmalloc mm/slab.c:3726 [inline]
   __kmalloc+0x15c/0x740 mm/slab.c:3735
   kmalloc include/linux/slab.h:553 [inline]
   kzalloc include/linux/slab.h:743 [inline]
   alloc_workqueue+0x13c/0xe70 kernel/workqueue.c:4236
   ucma_open+0x76/0x290 drivers/infiniband/core/ucma.c:1732
   misc_open+0x398/0x4c0 drivers/char/misc.c:141
   chrdev_open+0x247/0x6b0 fs/char_dev.c:417
   do_dentry_open+0x488/0x1160 fs/open.c:771
   vfs_open+0xa0/0xd0 fs/open.c:880
   do_last fs/namei.c:3416 [inline]
   path_openat+0x10e9/0x46e0 fs/namei.c:3533
   do_filp_open+0x1a1/0x280 fs/namei.c:3563
   do_sys_open+0x3fe/0x5d0 fs/open.c:1063
   __do_sys_openat fs/open.c:1090 [inline]
   __se_sys_openat fs/open.c:1084 [inline]
   __x64_sys_openat+0x9d/0x100 fs/open.c:1084
   do_syscall_64+0x103/0x610 arch/x86/entry/common.c:290
   entry_SYSCALL_64_after_hwframe+0x49/0xbe

  Freed by task 7789:
   save_stack+0x45/0xd0 mm/kasan/common.c:75
   set_track mm/kasan/common.c:87 [inline]
   __kasan_slab_free+0x102/0x150 mm/kasan/common.c:459
   kasan_slab_free+0xe/0x10 mm/kasan/common.c:467
   __cache_free mm/slab.c:3498 [inline]
   kfree+0xcf/0x230 mm/slab.c:3821
   alloc_workqueue+0xc3e/0xe70 kernel/workqueue.c:4295
   ucma_open+0x76/0x290 drivers/infiniband/core/ucma.c:1732
   misc_open+0x398/0x4c0 drivers/char/misc.c:141
   chrdev_open+0x247/0x6b0 fs/char_dev.c:417
   do_dentry_open+0x488/0x1160 fs/open.c:771
   vfs_open+0xa0/0xd0 fs/open.c:880
   do_last fs/namei.c:3416 [inline]
   path_openat+0x10e9/0x46e0 fs/namei.c:3533
   do_filp_open+0x1a1/0x280 fs/namei.c:3563
   do_sys_open+0x3fe/0x5d0 fs/open.c:1063
   __do_sys_openat fs/open.c:1090 [inline]
   __se_sys_openat fs/open.c:1084 [inline]
   __x64_sys_openat+0x9d/0x100 fs/open.c:1084
   do_syscall_64+0x103/0x610 arch/x86/entry/common.c:290
   entry_SYSCALL_64_after_hwframe+0x49/0xbe

  The buggy address belongs to the object at ffff888090fc2580
   which belongs to the cache kmalloc-512 of size 512
  The buggy address is located 280 bytes inside of
   512-byte region [ffff888090fc2580, ffff888090fc2780)

Reported-by: syzbot+17335689e239ce135d8b@syzkaller.appspotmail.com
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@surriel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will.deacon@arm.com>
Fixes: 669de8bda87b ("kernel/workqueue: Use dynamic lockdep keys for workqueues")
Link: https://lkml.kernel.org/r/20190303220046.29448-1-bvanassche@acm.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/workqueue.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel/workqueue.c')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index e163e7a7f5e5..ecdd5e6f2166 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4194,6 +4194,8 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
 	return wq;
 
 err_free_wq:
+	wq_unregister_lockdep(wq);
+	wq_free_lockdep(wq);
 	free_workqueue_attrs(wq->unbound_attrs);
 	kfree(wq);
 	return NULL;
-- 
cgit v1.3-7-g2ca7


From 69a106c00e8554a7e6b3f4bb2967332670f89337 Mon Sep 17 00:00:00 2001
From: Qian Cai <cai@lca.pw>
Date: Wed, 6 Mar 2019 19:27:31 -0500
Subject: workqueue, lockdep: Fix a memory leak in wq->lock_name

The following commit:

  669de8bda87b ("kernel/workqueue: Use dynamic lockdep keys for workqueues")

introduced a memory leak as wq_free_lockdep() calls kfree(wq->lock_name),
but wq_init_lockdep() does not point wq->lock_name to the newly allocated
slab object.

This can be reproduced by running LTP fallocate04 followed by oom01 tests:

 unreferenced object 0xc0000005876384d8 (size 64):
  comm "fallocate04", pid 26972, jiffies 4297139141 (age 40370.480s)
  hex dump (first 32 bytes):
    28 77 71 5f 63 6f 6d 70 6c 65 74 69 6f 6e 29 65  (wq_completion)e
    78 74 34 2d 72 73 76 2d 63 6f 6e 76 65 72 73 69  xt4-rsv-conversi
  backtrace:
    [<00000000cb452883>] kvasprintf+0x6c/0xe0
    [<000000004654ddac>] kasprintf+0x34/0x60
    [<000000001c68f311>] alloc_workqueue+0x1f8/0x6ac
    [<0000000003c2ad83>] ext4_fill_super+0x23d4/0x3c80 [ext4]
    [<0000000006610538>] mount_bdev+0x25c/0x290
    [<00000000bcf955ec>] ext4_mount+0x28/0x50 [ext4]
    [<0000000016e08fd3>] legacy_get_tree+0x4c/0xb0
    [<0000000042b6a5fc>] vfs_get_tree+0x6c/0x190
    [<00000000268ab022>] do_mount+0xb9c/0x1100
    [<00000000698e6898>] ksys_mount+0x158/0x180
    [<0000000064e391fd>] sys_mount+0x20/0x30
    [<00000000ba378f12>] system_call+0x5c/0x70

Signed-off-by: Qian Cai <cai@lca.pw>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@surriel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will.deacon@arm.com>
Cc: catalin.marinas@arm.com
Cc: jiangshanlai@gmail.com
Cc: tj@kernel.org
Fixes: 669de8bda87b ("kernel/workqueue: Use dynamic lockdep keys for workqueues")
Link: https://lkml.kernel.org/r/20190307002731.47371-1-cai@lca.pw
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/workqueue.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel/workqueue.c')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ecdd5e6f2166..e14da4f599ab 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3348,6 +3348,8 @@ static void wq_init_lockdep(struct workqueue_struct *wq)
 	lock_name = kasprintf(GFP_KERNEL, "%s%s", "(wq_completion)", wq->name);
 	if (!lock_name)
 		lock_name = wq->name;
+
+	wq->lock_name = lock_name;
 	lockdep_init_map(&wq->lockdep_map, lock_name, &wq->key, 0);
 }
 
-- 
cgit v1.3-7-g2ca7